__author__ = 'Directorli'
import Image
import os
import svmlight
def binary(x, y, file):
    """Binarise an image and return its pixels as one flat string.

    The image is opened, converted to greyscale ('L') and thresholded through
    a 256-entry lookup table.  The pixels of the top-left region are then
    read row by row and the str() of each pixel value is concatenated.

    x    -- number of pixel columns to read from each row
    y    -- number of pixel rows to read
    file -- image location, handed straight to Image.open

    Returns the concatenated pixel values as a string of x*y entries
    (presumably single '0'/'1' characters -- confirm against the PIL
    version in use).
    """
    im = Image.open(file)
    Lim = im.convert('L')
    threshold = 80
    # Lookup table for point(): grey levels below the threshold become 0,
    # everything else becomes 1.
    table = [0 if i < threshold else 1 for i in range(256)]
    # convert to binary image by the table
    bim = Lim.point(table, '1')
    bdata = bim.load()
    # Pixel access is [column, row]; rows are the outer loop so the output
    # is in row-major order.
    return ''.join(str(bdata[b, a]) for a in range(y) for b in range(x))
def totrain(x):
    """Build the labelled training set for the classifier of digit *x*.

    Every file in the hard-coded sample directory is binarised as a 10x10
    image via binary() and turned into a list of (feature_index, value)
    pairs, with feature indexes starting at 1.

    x -- the digit this training set is for; compared as str(x)

    Returns a list of (label, features) tuples, one per file, where label is
    1 when the fifth character from the end of the file name equals str(x)
    and -1 otherwise.
    """
    path = 'bmp\\cut\\all\\'
    f = []
    for one in os.listdir(path):
        a = binary(10, 10, path + one)
        # Feature numbering is 1-based.
        d = [(c, int(b)) for c, b in enumerate(a, 1)]
        # one[-5] is presumably the digit just before a 4-character
        # extension such as '.bmp' -- verify against the sample file names.
        label = 1 if str(x) == one[-5] else -1
        f.append((label, d))
    return f
def trainall():
    """Train one classification model per digit 0-9 and save each to disk.

    For every digit i, the training set from totrain(i) is passed to
    svmlight.learn and the resulting model is written to the file
    'model<i>' (model0 ... model9) in the current directory.
    """
    for i in range(10):
        training_data = totrain(i)  # train set
        # train a model based on the data
        model = svmlight.learn(training_data, type='classification', verbosity=0)
        svmlight.write_model(model, 'model' + str(i))  # write model
生成的知识模式(model 0~9):
7、再获取一定数量的验证码图片试着识别:
因为图片太规整,识别率太高(100%),所以对图片手动加些处理
以下是机器命名的,识别率仍然算高
上代码:
#ocr.py
#python ver 2.7
__author__ = 'Directorli'
import Image
import svmlight
def binary(x, file):
    """Binarise one 10x10 character slot of an image and return it as a string.

    The image is opened, converted to greyscale ('L') and thresholded through
    a 256-entry lookup table.  Rows 0-9 of the column band
    [x*10, x*10 + 10) are then read row by row and the str() of each pixel
    value is concatenated.

    x    -- zero-based index of the 10-pixel-wide slot to read
    file -- image location, handed straight to Image.open

    Returns the concatenated pixel values as a string of 100 entries
    (presumably single '0'/'1' characters -- confirm against the PIL
    version in use).
    """
    im = Image.open(file)
    Lim = im.convert('L')
    threshold = 80
    # Lookup table for point(): grey levels below the threshold become 0,
    # everything else becomes 1.
    table = [0 if i < threshold else 1 for i in range(256)]
    # convert to binary image by the table
    bim = Lim.point(table, '1')
    bdata = bim.load()
    left = x * 10
    # Pixel access is [column, row]; rows are the outer loop so the output
    # is in row-major order.
    return ''.join(
        str(bdata[b, a]) for a in range(10) for b in range(left, left + 10)
    )
def chformat(x):
    """Wrap a string of digit characters as a single labelled sample.

    x -- iterable of characters, each convertible with int()

    Returns a one-element list [(1, features)], where features is a list of
    (feature_index, value) pairs with indexes starting at 1.  The label is
    always 1.

    Raises ValueError if an element of x cannot be converted by int().
    """
    # Feature numbering is 1-based.
    d = [(c, int(b)) for c, b in enumerate(x, 1)]
    return [(1, d)]
def ocr(filename):
    """Recognise the characters of a 4-slot captcha image.

    Each of the 4 slots is binarised with binary(), formatted with
    chformat() and classified against the ten saved models 'model0' ...
    'model9'.  The digit of every model whose first prediction is positive
    is appended to the result, so a slot may contribute zero or several
    digits.

    filename -- image location, passed on to binary()

    Prints the recognised string and returns it.
    """
    digits = []
    for num in range(4):
        test = chformat(binary(num, filename))
        for i in range(10):
            # NOTE(review): every model file is re-read for each slot;
            # loading the ten models once up front would avoid that if the
            # model objects are safe to reuse -- confirm before changing.
            model = svmlight.read_model('model' + str(i))  # read model
            prediction = svmlight.classify(model, test)
            if prediction[0] > 0:
                digits.append(str(i))
    result = ''.join(digits)
    print(result)
    return result
#ocr('code.bmp')
import os
# Run the recogniser over every file in the test directory; ocr() prints
# each result as it goes.
allfile = os.listdir('test\\')
for a in allfile:
    b = str(ocr('test\\' + a))
联系客服