java实现查找PDF关键字所在页码及其坐标
1、因为最近有这方面的需求,用过之后记录一下。
2、此功能与PDF阅读器中的Ctrl+F查找性质相同:只能匹配PDF中的文本内容,如果PDF的内容是图片形式(例如扫描件),则无法定位到关键字。
importcom.itextpdf.awt.geom.Rectangle2D.Float; importcom.itextpdf.text.pdf.PdfDictionary; importcom.itextpdf.text.pdf.PdfName; importcom.itextpdf.text.pdf.PdfReader; importcom.itextpdf.text.pdf.parser.*; importjava.io.File; importjava.io.FileInputStream; importjava.io.IOException; importjava.util.ArrayList; importjava.util.List; /** *消失的太阳 */ publicclassMyTest{ publicstaticvoidmain(String[]args)throwsIOException{ //1.给定文件 FilepdfFile=newFile("D://test.pdf"); //2.定义一个byte数组,长度为文件的长度 byte[]pdfData=newbyte[(int)pdfFile.length()]; //3.IO流读取文件内容到byte数组 FileInputStreaminputStream=null; try{ inputStream=newFileInputStream(pdfFile); inputStream.read(pdfData); }catch(IOExceptione){ throwe; }finally{ if(inputStream!=null){ try{ inputStream.close(); }catch(IOExceptione){ } } } //4.指定关键字 Stringkeyword="消失的太阳:"; //5.调用方法,给定关键字和文件 Listpositions=findKeywordPostions(pdfData,keyword); //6.返回值类型是List 每个list元素代表一个匹配的位置,分别为float[0]所在页码float[1]所在x轴float[2]所在y轴 System.out.println("total:"+positions.size()); if(positions!=null&&positions.size()>0){ for(float[]position:positions){ System.out.print("pageNum:"+(int)position[0]); System.out.print("\tx:"+position[1]); System.out.println("\ty:"+position[2]); } } } /** *findKeywordPostions *@parampdfData通过IO流PDF文件转化的byte数组 *@paramkeyword关键字 *@returnList :float[0]:pageNumfloat[1]:xfloat[2]:y *@throwsIOException */ publicstaticList findKeywordPostions(byte[]pdfData,Stringkeyword)throwsIOException{ List result=newArrayList<>(); List pdfPageContentPositions=getPdfContentPostionsList(pdfData); for(PdfPageContentPositionspdfPageContentPosition:pdfPageContentPositions){ List charPositions=findPositions(keyword,pdfPageContentPosition); if(charPositions==null||charPositions.size()<1){ continue; } result.addAll(charPositions); } returnresult; } privatestaticList getPdfContentPostionsList(byte[]pdfData)throwsIOException{ PdfReaderreader=newPdfReader(pdfData); List result=newArrayList<>(); intpages=reader.getNumberOfPages(); 
for(intpageNum=1;pageNum<=pages;pageNum++){ floatwidth=reader.getPageSize(pageNum).getWidth(); floatheight=reader.getPageSize(pageNum).getHeight(); PdfRenderListenerpdfRenderListener=newPdfRenderListener(pageNum,width,height); //解析pdf,定位位置 PdfContentStreamProcessorprocessor=newPdfContentStreamProcessor(pdfRenderListener); PdfDictionarypageDic=reader.getPageN(pageNum); PdfDictionaryresourcesDic=pageDic.getAsDict(PdfName.RESOURCES); try{ processor.processContent(ContentByteUtils.getContentBytesForPage(reader,pageNum),resourcesDic); }catch(IOExceptione){ reader.close(); throwe; } Stringcontent=pdfRenderListener.getContent(); List charPositions=pdfRenderListener.getcharPositions(); List positionsList=newArrayList<>(); for(CharPositioncharPosition:charPositions){ float[]positions=newfloat[]{charPosition.getPageNum(),charPosition.getX(),charPosition.getY()}; positionsList.add(positions); } PdfPageContentPositionspdfPageContentPositions=newPdfPageContentPositions(); pdfPageContentPositions.setContent(content); pdfPageContentPositions.setPostions(positionsList); result.add(pdfPageContentPositions); } reader.close(); returnresult; } privatestaticList findPositions(Stringkeyword,PdfPageContentPositionspdfPageContentPositions){ List result=newArrayList<>(); Stringcontent=pdfPageContentPositions.getContent(); List charPositions=pdfPageContentPositions.getPositions(); for(intpos=0;pos positions; publicStringgetContent(){ returncontent; } publicvoidsetContent(Stringcontent){ this.content=content; } publicList getPositions(){ returnpositions; } publicvoidsetPostions(List positions){ this.positions=positions; } } privatestaticclassPdfRenderListenerimplementsRenderListener{ privateintpageNum; privatefloatpageWidth; privatefloatpageHeight; privateStringBuildercontentBuilder=newStringBuilder(); privateList charPositions=newArrayList<>(); publicPdfRenderListener(intpageNum,floatpageWidth,floatpageHeight){ this.pageNum=pageNum; this.pageWidth=pageWidth; this.pageHeight=pageHeight; } 
publicvoidbeginTextBlock(){ } publicvoidrenderText(TextRenderInforenderInfo){ List characterRenderInfos=renderInfo.getCharacterRenderInfos(); for(TextRenderInfotextRenderInfo:characterRenderInfos){ Stringword=textRenderInfo.getText(); if(word.length()>1){ word=word.substring(word.length()-1,word.length()); } Floatrectangle=textRenderInfo.getAscentLine().getBoundingRectange(); floatx=(float)rectangle.getX(); floaty=(float)rectangle.getY(); //floatx=(float)rectangle.getCenterX(); //floaty=(float)rectangle.getCenterY(); //doublex=rectangle.getMinX(); //doubley=rectangle.getMaxY(); //这两个是关键字在所在页面的XY轴的百分比 floatxPercent=Math.round(x/pageWidth*10000)/10000f; floatyPercent=Math.round((1-y/pageHeight)*10000)/10000f; //CharPositioncharPosition=newCharPosition(pageNum,xPercent,yPercent); CharPositioncharPosition=newCharPosition(pageNum,(float)x,(float)y); charPositions.add(charPosition); contentBuilder.append(word); } } publicvoidendTextBlock(){ } publicvoidrenderImage(ImageRenderInforenderInfo){ } publicStringgetContent(){ returncontentBuilder.toString(); } publicList getcharPositions(){ returncharPositions; } } privatestaticclassCharPosition{ privateintpageNum=0; privatefloatx=0; privatefloaty=0; publicCharPosition(intpageNum,floatx,floaty){ this.pageNum=pageNum; this.x=x; this.y=y; } publicintgetPageNum(){ returnpageNum; } publicfloatgetX(){ returnx; } publicfloatgetY(){ returny; } @Override publicStringtoString(){ return"[pageNum="+this.pageNum+",x="+this.x+",y="+this.y+"]"; } } }
总结
以上所述是小编给大家介绍的java实现查找PDF关键字所在页码及其坐标,希望对大家有所帮助,如果大家有任何疑问请给我留言,小编会及时回复大家的。在此也非常感谢大家对毛票票网站的支持!
如果你觉得本文对你有帮助,欢迎转载,烦请注明出处,谢谢!