识别率很高的java文字识别技术
Java文字识别程序的关键是找到一个可供调用的OCR引擎。tesseract-ocr就是这样一个OCR引擎:它于1985年至1995年间由HP实验室开发,现在由Google维护。tesseract-ocr 3.0发布后开始支持中文。不过tesseract-ocr 3.0本身不提供图形化界面的客户端,而第三方编写的图形化客户端FreeOCR还不支持导入新的3.0 traineddata。但这标志着,现在已经有自由的中文OCR软件了。
java中使用tesseract-ocr3.01的步骤如下:
1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)
2.在安装向导中可以选择需要下载的语言包。
3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar
4.java程序清单:
ImageIOHelper类:
package com.hhp.util;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.Locale;

import javax.imageio.IIOImage;
import javax.imageio.ImageIO;
import javax.imageio.ImageReader;
import javax.imageio.ImageWriteParam;
import javax.imageio.ImageWriter;
import javax.imageio.metadata.IIOMetadata;
import javax.imageio.stream.ImageInputStream;
import javax.imageio.stream.ImageOutputStream;

import com.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;

/**
 * Converts an image file into an uncompressed TIFF copy that can be handed to
 * the Tesseract command-line tool.
 *
 * <p>Declared in {@code com.hhp.util} because {@code com.hhp.util.OCR} calls it
 * without an import; a class in the default package is not visible from a
 * named package.
 */
public class ImageIOHelper {

    /**
     * Re-encodes {@code imageFile} as an uncompressed TIFF written next to the
     * original (see {@link #tempImageFile(File)} for the naming rule).
     *
     * @param imageFile   the source image
     * @param imageFormat informal ImageIO format name of the source, e.g. {@code "png"}
     * @return the TIFF file, or {@code null} if an {@link IOException} occurred
     *         (the stack trace is printed and any partial output is removed)
     * @throws IllegalArgumentException if no ImageIO reader handles {@code imageFormat}
     * @throws IllegalStateException    if no TIFF writer is registered with ImageIO
     */
    public static File createImage(File imageFile, String imageFormat) {
        File tempFile = null;
        ImageReader reader = null;
        ImageWriter writer = null;
        ImageInputStream iis = null;
        ImageOutputStream ios = null;
        boolean succeeded = false;
        try {
            Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
            if (!readers.hasNext()) {
                throw new IllegalArgumentException("No ImageIO reader for format: " + imageFormat);
            }
            reader = readers.next();

            iis = ImageIO.createImageInputStream(imageFile);
            if (iis == null) {
                throw new IOException("Cannot open an image input stream for " + imageFile);
            }
            reader.setInput(iis);

            // Read the stream metadata
            IIOMetadata streamMetadata = reader.getStreamMetadata();

            // Set up the write param: compression is disabled for the TIFF output
            TIFFImageWriteParam tiffWriteParam = new TIFFImageWriteParam(Locale.CHINESE);
            tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

            // Get the TIFF writer and point it at the output file
            Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
            if (!writers.hasNext()) {
                throw new IllegalStateException(
                        "No TIFF ImageIO writer is registered; check that jai_imageio is on the classpath");
            }
            writer = writers.next();

            BufferedImage bi = reader.read(0);
            IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

            tempFile = tempImageFile(imageFile);
            ios = ImageIO.createImageOutputStream(tempFile);
            if (ios == null) {
                throw new IOException("Cannot open an image output stream for " + tempFile);
            }
            writer.setOutput(ios);
            writer.write(streamMetadata, image, tiffWriteParam);

            // Close explicitly on the success path so that a failure while
            // flushing is reported instead of being swallowed in finally.
            ios.close();
            ios = null;
            succeeded = true;
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(ios);
            closeQuietly(iis);
            if (writer != null) {
                writer.dispose();
            }
            if (reader != null) {
                reader.dispose();
            }
            // Streams are closed first: an open file cannot be deleted on Windows.
            if (!succeeded && tempFile != null) {
                tempFile.delete();
            }
        }
        return succeeded ? tempFile : null;
    }

    /**
     * Derives the temporary TIFF path from the source path: a {@code 0} is
     * appended to the base name and the extension becomes {@code tif}, so
     * {@code 4.png} maps to {@code 40.tif}. A path without an extension simply
     * gets {@code 0.tif} appended.
     */
    private static File tempImageFile(File imageFile) {
        String path = imageFile.getPath();
        int dot = path.lastIndexOf('.');
        int separator = Math.max(path.lastIndexOf('/'), path.lastIndexOf('\\'));
        // A dot that belongs to a directory name is not an extension separator.
        String stem = dot > separator ? path.substring(0, dot) : path;
        return new File(stem + "0.tif");
    }

    /** Closes the stream if it is open, ignoring failures; used only during cleanup. */
    private static void closeQuietly(ImageInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Best-effort cleanup; the primary failure has already been reported.
        }
    }
}
OCR类:
package com.hhp.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

import org.jdesktop.swingx.util.OS;

/**
 * Runs the Tesseract command-line program on an image and returns the
 * recognized text.
 */
public class OCR {
    /** Tesseract language switch: a lowercase letter L, not the digit 1. */
    private static final String LANG_OPTION = "-l";
    private static final String EOL = System.getProperty("line.separator");

    /** Tesseract installation directory, used on every platform except Linux. */
    private String tessPath = "C://Program Files (x86)//Tesseract-OCR";

    /**
     * Recognizes the text in {@code imageFile} using the {@code chi_sim}
     * language data.
     *
     * <p>The image is first converted to a temporary TIFF by
     * {@code ImageIOHelper.createImage}; Tesseract is then run in the image's
     * directory, writing {@code output.txt} there. Both the temporary TIFF and
     * the result file are deleted before this method returns.
     *
     * @param imageFile   the image to recognize
     * @param imageFormat informal ImageIO format name of the image, e.g. {@code "png"}
     * @return the recognized text, each line terminated by the platform line separator
     * @throws RuntimeException if the image cannot be converted or Tesseract
     *                          exits with a non-zero status
     * @throws Exception        if the process cannot be started, the wait is
     *                          interrupted, or the result file cannot be read
     */
    public String recognizeText(File imageFile, String imageFormat) throws Exception {
        File tempImage = ImageIOHelper.createImage(imageFile, imageFormat);
        if (tempImage == null) {
            throw new RuntimeException("Could not convert " + imageFile + " to a TIFF image.");
        }

        File outputFile = new File(imageFile.getParentFile(), "output");
        File resultFile = new File(outputFile.getAbsolutePath() + ".txt");

        // Resulting command line, e.g.: tesseract.exe 40.tif output -l chi_sim
        List<String> cmd = new ArrayList<String>();
        cmd.add(OS.isLinux() ? "tesseract" : new File(tessPath, "tesseract").getPath());
        cmd.add(tempImage.getName());
        cmd.add(outputFile.getName());
        cmd.add(LANG_OPTION);
        cmd.add("chi_sim");

        ProcessBuilder pb = new ProcessBuilder(cmd);
        // File names in the command are relative to the image's directory.
        pb.directory(imageFile.getParentFile());
        pb.redirectErrorStream(true);

        Process process = null;
        try {
            process = pb.start();
            // The merged stdout/stderr must be consumed before waiting:
            // otherwise the child blocks once the pipe buffer is full and
            // waitFor() never returns.
            String console = drainOutput(process);
            int exitCode = process.waitFor();
            if (exitCode != 0) {
                String msg = describeExitCode(exitCode);
                if (console.length() > 0) {
                    msg = msg + " Tesseract output: " + console;
                }
                throw new RuntimeException(msg);
            }
            try {
                return readResult(resultFile);
            } finally {
                resultFile.delete();
            }
        } finally {
            if (process != null) {
                // Harmless after a normal exit; stops the child if the wait was interrupted.
                process.destroy();
            }
            // Remove the temporary working image on every path.
            tempImage.delete();
        }
    }

    /** Reads everything the process writes to its (merged) output stream. */
    private static String drainOutput(Process process) throws java.io.IOException {
        StringBuilder console = new StringBuilder();
        BufferedReader out = new BufferedReader(new InputStreamReader(process.getInputStream()));
        try {
            String line;
            while ((line = out.readLine()) != null) {
                console.append(line).append('\n');
            }
        } finally {
            out.close();
        }
        return console.toString().trim();
    }

    /** Reads the UTF-8 result file, terminating every line with {@link #EOL}. */
    private static String readResult(File resultFile) throws java.io.IOException {
        StringBuilder text = new StringBuilder();
        BufferedReader in = new BufferedReader(
                new InputStreamReader(new FileInputStream(resultFile), "UTF-8"));
        try {
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append(EOL);
            }
        } finally {
            in.close();
        }
        return text.toString();
    }

    /** Maps a non-zero Tesseract exit status to the message reported to the caller. */
    private static String describeExitCode(int exitCode) {
        switch (exitCode) {
            case 1:
                return "Errors accessing files. There may be spaces in your image's filename.";
            case 29:
                return "Cannot recognize the image or its selected region.";
            case 31:
                return "Unsupported image format.";
            default:
                return "Errors occurred.";
        }
    }
}
测试类OcrTest:
importjava.io.File; importjava.io.IOException; importcom.hhp.util.OCR; publicclassOcrTest{ publicstaticvoidmain(String[]args){ Stringpath="C://temp//OCRcode//4.png"; System.out.println("ORCTestBegin......"); try{ StringvalCode=newOCR().recognizeText(newFile(path),"png"); System.out.println(valCode); }catch(IOExceptione){ e.printStackTrace(); }catch(Exceptione){ e.printStackTrace(); } System.out.println("ORCTestEnd......"); } }
经过测试,tesseract-ocr3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。