识别率很高的java文字识别技术
Java文字识别程序的关键是找到一个可以调用的OCR引擎。tesseract-ocr就是这样一个OCR引擎:它在1985年到1995年间由HP实验室开发,现在由Google维护。tesseract-ocr 3.0发布后开始支持中文。不过tesseract-ocr 3.0本身是命令行程序,不带图形化界面;别人写的FreeOCR图形化客户端还不支持导入新的3.0 traineddata。但这标志着,现在有自由的中文OCR软件了。
java中使用tesseract-ocr3.01的步骤如下:
1.下载安装tesseract-ocr-setup-3.01-1.exe(3.0以上版本才增加了中文识别)
2.在安装向导中可以选择需要下载的语言包。
3.到网上搜索下载java图形处理所需的2个包:jai_imageio-1.1-alpha.jar,swingx-1.6.1.jar
4.java程序清单:
ImageIOHelper类(需与下面的OCR类放在同一个包中,即文件开头同样要加上 package com.hhp.util; 否则OCR类无法引用它):
importjava.awt.image.BufferedImage;
importjava.io.File;
importjava.io.IOException;
importjava.util.Iterator;
importjava.util.Locale;
importjavax.imageio.IIOImage;
importjavax.imageio.ImageIO;
importjavax.imageio.ImageReader;
importjavax.imageio.ImageWriteParam;
importjavax.imageio.ImageWriter;
importjavax.imageio.metadata.IIOMetadata;
importjavax.imageio.stream.ImageInputStream;
importjavax.imageio.stream.ImageOutputStream;
importcom.sun.media.imageio.plugins.tiff.TIFFImageWriteParam;
/**
 * Converts an image file into an uncompressed TIFF copy that can be handed to
 * an external OCR engine.
 */
public class ImageIOHelper {

    /**
     * Reads the first image of {@code imageFile} with an ImageIO reader for
     * {@code imageFormat} and writes it, uncompressed, to a sibling TIFF file.
     *
     * <p>The TIFF file is placed next to the input and named after it with a
     * {@code 0} appended to the base name and a {@code .tif} extension
     * (for example {@code 4.png} becomes {@code 40.tif}).
     *
     * @param imageFile   the image to convert
     * @param imageFormat informal ImageIO format name of the input, e.g. {@code "png"}
     * @return the TIFF file; {@code null} if an I/O error occurred before the
     *         target file was determined. I/O errors are printed to standard
     *         error rather than thrown, so after a failed write the returned
     *         file may be missing or incomplete.
     * @throws java.util.NoSuchElementException if no ImageIO reader for
     *         {@code imageFormat}, or no TIFF writer, is registered
     */
    public static File createImage(File imageFile, String imageFormat) {
        File tempFile = null;
        ImageReader reader = null;
        ImageWriter writer = null;
        ImageInputStream iis = null;
        ImageOutputStream ios = null;
        try {
            Iterator<ImageReader> readers = ImageIO.getImageReadersByFormatName(imageFormat);
            reader = readers.next();
            iis = ImageIO.createImageInputStream(imageFile);
            reader.setInput(iis);

            // Read the stream metadata so it can be passed on to the writer.
            IIOMetadata streamMetadata = reader.getStreamMetadata();

            // Get a TIFF writer and let it supply its own write parameters, so the
            // parameter object always matches whichever TIFF plugin is installed.
            Iterator<ImageWriter> writers = ImageIO.getImageWritersByFormatName("tiff");
            writer = writers.next();
            ImageWriteParam tiffWriteParam = writer.getDefaultWriteParam();
            tiffWriteParam.setCompressionMode(ImageWriteParam.MODE_DISABLED);

            BufferedImage bi = reader.read(0);
            IIOImage image = new IIOImage(bi, null, reader.getImageMetadata(0));

            tempFile = tempImageFile(imageFile);
            // Remove any stale file first: the output stream does not truncate an
            // existing file, which could leave old bytes behind the new image.
            tempFile.delete();
            ios = ImageIO.createImageOutputStream(tempFile);
            writer.setOutput(ios);
            writer.write(streamMetadata, image, tiffWriteParam);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Release everything even when reading or writing failed.
            closeQuietly(ios);
            closeQuietly(iis);
            if (writer != null) {
                writer.dispose();
            }
            if (reader != null) {
                reader.dispose();
            }
        }
        return tempFile;
    }

    /**
     * Builds the name of the temporary TIFF file for {@code imageFile}: same
     * directory, base name followed by {@code 0}, extension {@code .tif}.
     * The extra {@code 0} keeps the result distinct from the input even when
     * the input already ends in {@code .tif}.
     */
    private static File tempImageFile(File imageFile) {
        // Work on the file name only, so a dot inside a directory name is never
        // mistaken for the extension separator.
        String name = imageFile.getName();
        int dot = name.lastIndexOf('.');
        String baseName = (dot >= 0) ? name.substring(0, dot) : name;
        return new File(imageFile.getParentFile(), baseName + "0.tif");
    }

    /** Closes an ImageIO stream, reporting but not propagating a failure. */
    private static void closeQuietly(ImageInputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
OCR类:
packagecom.hhp.util;
importjava.io.BufferedReader;
importjava.io.File;
importjava.io.FileInputStream;
importjava.io.InputStreamReader;
importjava.util.ArrayList;
importjava.util.List;
importorg.jdesktop.swingx.util.OS;
publicclassOCR{
privatefinalStringLANG_OPTION="-l";//英文字母小写l,并非数字1
privatefinalStringEOL=System.getProperty("line.separator");
privateStringtessPath="C://ProgramFiles(x86)//Tesseract-OCR";
//privateStringtessPath=newFile("tesseract").getAbsolutePath();
publicStringrecognizeText(FileimageFile,StringimageFormat)throwsException{
FiletempImage=ImageIOHelper.createImage(imageFile,imageFormat);
FileoutputFile=newFile(imageFile.getParentFile(),"output");
StringBufferstrB=newStringBuffer();
Listcmd=newArrayList();
if(OS.isWindowsXP()){
cmd.add(tessPath+"//tesseract");
}elseif(OS.isLinux()){
cmd.add("tesseract");
}else{
cmd.add(tessPath+"//tesseract");
}
cmd.add("");
cmd.add(outputFile.getName());
cmd.add(LANG_OPTION);
cmd.add("chi_sim");
//cmd.add("eng");
ProcessBuilderpb=newProcessBuilder();
pb.directory(imageFile.getParentFile());
cmd.set(1,tempImage.getName());
pb.command(cmd);
pb.redirectErrorStream(true);
Processprocess=pb.start();
//tesseract.exe1.jpg1-lchi_sim
intw=process.waitFor();
//删除临时正在工作文件
tempImage.delete();
if(w==0){
BufferedReaderin=newBufferedReader(newInputStreamReader(newFileInputStream(outputFile.getAbsolutePath()+".txt"),"UTF-8"));
Stringstr;
while((str=in.readLine())!=null){
strB.append(str).append(EOL);
}
in.close();
}else{
Stringmsg;
switch(w){
case1:
msg="Errorsaccessingfiles.Theremaybespacesinyourimage'sfilename.";
break;
case29:
msg="Cannotrecongnizetheimageoritsselectedregion.";
break;
case31:
msg="Unsupportedimageformat.";
break;
default:
msg="Errorsoccurred.";
}
tempImage.delete();
thrownewRuntimeException(msg);
}
newFile(outputFile.getAbsolutePath()+".txt").delete();
returnstrB.toString();
}
}
测试类OcrTest:
importjava.io.File;
importjava.io.IOException;
importcom.hhp.util.OCR;
publicclassOcrTest{
publicstaticvoidmain(String[]args){
Stringpath="C://temp//OCRcode//4.png";
System.out.println("ORCTestBegin......");
try{
StringvalCode=newOCR().recognizeText(newFile(path),"png");
System.out.println(valCode);
}catch(IOExceptione){
e.printStackTrace();
}catch(Exceptione){
e.printStackTrace();
}
System.out.println("ORCTestEnd......");
}
}
经过测试,tesseract-ocr3.01的文字识别率很高,对于网站中常见的验证码识别率也很高。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。