python生成lmdb格式的文件实例
在crnn训练的时候需要用到lmdb格式的数据集,下面是用python生成lmdb格式数据集的代码。注意一定要在linux系统下运行,否则读入图像的时候会出问题。可能遇到的问题都在代码里面注释了,看代码即可。
#-*-coding:utf-8-*-
import glob
import io
import os

import cv2
import lmdb  # install first: pip install lmdb
import numpy as np
def checkImageIsValid(imageBin):
    """Return True when ``imageBin`` decodes to a non-empty image.

    Args:
        imageBin: Raw contents of an encoded image file (bytes), or None.

    Returns:
        False if ``imageBin`` is None or empty, if OpenCV cannot decode it,
        or if the decoded image has zero height or width; True otherwise.
    """
    # An empty buffer cannot hold an image; reject it before handing it to
    # cv2.imdecode.
    if imageBin is None or len(imageBin) == 0:
        return False
    # np.frombuffer views the raw bytes as uint8 without copying;
    # np.fromstring is deprecated for binary input.
    imageBuf = np.frombuffer(imageBin, dtype=np.uint8)
    img = cv2.imdecode(imageBuf, cv2.IMREAD_GRAYSCALE)
    if img is None:
        return False
    imgH, imgW = img.shape[0], img.shape[1]
    return imgH * imgW != 0
def writeCache(env, cache):
    """Write every entry of ``cache`` to ``env`` inside one write transaction.

    Args:
        env: An opened LMDB environment (anything whose ``begin(write=True)``
            returns a context manager exposing ``put(key, value)``).
        cache: Mapping of keys to values. Text keys/values are encoded as
            UTF-8; byte strings are stored unchanged.
    """
    with env.begin(write=True) as txn:
        # dict.items() works on both Python 2 and 3 (iteritems() is 2-only).
        for key, value in cache.items():
            # LMDB stores raw bytes, so text must be encoded first.
            if not isinstance(key, bytes):
                key = key.encode('utf-8')
            if not isinstance(value, bytes):
                value = value.encode('utf-8')
            txn.put(key, value)
def _toBytes(value):
    """Return ``value`` as bytes, encoding text as UTF-8."""
    if isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def createDataset(outputPath, imagePathList, labelList, lexiconList=None,
                  checkValid=True, mapSize=8589934592):
    """Create an LMDB dataset for CRNN training.

    Entries are stored under the keys ``image-%09d``, ``label-%09d`` and
    (optionally) ``lexicon-%09d``, numbered from 1 over the samples that were
    actually written, plus ``num-samples`` holding the final count.

    Args:
        outputPath: LMDB output path.
        imagePathList: List of image paths.
        labelList: List of corresponding ground-truth texts.
        lexiconList: (optional) list of lexicon lists, one per image.
        checkValid: If true, check the validity of every image and skip the
            ones that fail.
        mapSize: Maximum size of the LMDB map in bytes. Defaults to 8 GiB
            (the value was lowered from 1 TiB to avoid "not enough disk
            space" errors).

    Raises:
        AssertionError: If ``imagePathList`` and ``labelList`` differ in
            length.
    """
    assert len(imagePathList) == len(labelList), \
        'imagePathList and labelList must have the same length'
    nSamples = len(imagePathList)
    print('...................')
    env = lmdb.open(outputPath, map_size=mapSize)
    try:
        cache = {}
        cnt = 1
        for i, (imagePath, label) in enumerate(zip(imagePathList, labelList)):
            if not os.path.exists(imagePath):
                print('%s does not exist' % imagePath)
                continue
            # Images are binary data: read them in 'rb' mode. Text mode
            # corrupts or fails on the bytes (this is why the script
            # previously appeared to work only on Linux).
            with open(imagePath, 'rb') as f:
                imageBin = f.read()
            if checkValid and not checkImageIsValid(imageBin):
                print('%s is not a valid image' % imagePath)
                continue

            # LMDB stores raw bytes, so keys and text values are encoded.
            cache[_toBytes('image-%09d' % cnt)] = imageBin
            cache[_toBytes('label-%09d' % cnt)] = _toBytes(label)
            if lexiconList:
                # Space-separated so the individual lexicon words can be
                # recovered when the dataset is read back.
                cache[_toBytes('lexicon-%09d' % cnt)] = _toBytes(
                    ' '.join(lexiconList[i]))
            # Flush in batches of 1000 samples to bound memory use.
            if cnt % 1000 == 0:
                writeCache(env, cache)
                cache = {}
                print('Written %d / %d' % (cnt, nSamples))
            cnt += 1
        # cnt is one past the last sample actually written.
        nSamples = cnt - 1
        cache[_toBytes('num-samples')] = _toBytes(str(nSamples))
        writeCache(env, cache)
    finally:
        # Release the environment even if reading or writing failed.
        env.close()
    print('Created dataset with %d samples' % nSamples)
def read_text(path, encoding='utf-8'):
    """Return the contents of the text file at ``path``, stripped of
    leading and trailing whitespace.

    Args:
        path: Path of the text file to read.
        encoding: Text encoding of the file. Given explicitly so the result
            does not depend on the platform's default encoding.

    Returns:
        The decoded, stripped file contents.
    """
    # io.open accepts ``encoding`` on both Python 2 and Python 3.
    with io.open(path, encoding=encoding) as f:
        return f.read().strip()
if __name__ == '__main__':
    # LMDB output directory. The training and validation sets are generated
    # separately, so run this script once for each of them.
    outputPath = 'D:/ruanjianxiazai/tuxiangyangben/fengehou/train'
    # Each .jpg is expected to sit in the same directory as a .txt file of
    # the same base name that holds its label.
    path = "D:/ruanjianxiazai/tuxiangyangben/fengehou/chenguang/*.jpg"
    imagePathList = glob.glob(path)
    print('------------ %d ------------' % len(imagePathList))

    imgLabelLists = []
    for p in imagePathList:
        # Swap only the extension; str.replace could also rewrite a
        # directory name that happens to contain '.jpg'.
        labelPath = os.path.splitext(p)[0] + '.txt'
        try:
            label = read_text(labelPath)
        except (IOError, OSError, UnicodeDecodeError) as err:
            # Skip images whose label is missing or unreadable, but say so
            # instead of dropping them silently.
            print('skipping %s: %s' % (p, err))
            continue
        imgLabelLists.append((p, label))

    # Sort the samples by label length.
    imgLabelList = sorted(imgLabelLists, key=lambda x: len(x[1]))
    imgPaths = [p[0] for p in imgLabelList]
    txtLists = [p[1] for p in imgLabelList]
    createDataset(outputPath, imgPaths, txtLists, lexiconList=None,
                  checkValid=True)
以上这篇python生成lmdb格式的文件实例就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持毛票票。