java + dom4j.jar提取xml文档内容
本文实例为大家分享了java+dom4j.jar提取xml文档内容的具体代码,供大家参考,具体内容如下
资源下载页:点击下载
本例程主要通过几种遍历操作来提取xml文档中的内容。这并不是最优的实现方式,主要目的是练习遍历操作的使用。
xml格式文档内容:
-- An End to Nuclear Testing - - ATOMIC WEAPONS; NUCLEAR TESTS; TESTS AND TESTING; EDITORIALS; CLINTON, BILL (PRES); Editorial; Top/Opinion; Top/Opinion/Opinion; Top/Opinion/Opinion/Editorials; Nuclear Tests; Atomic Weapons; Tests and Testing; Armament, Defense and Military Forces - - - - An End to Nuclear Testing - For nearly half a century, test explosions in the Nevada desert were a reverberating reminder of cold war insecurity. Now the biggest worry is nuclear proliferation, not the Soviet threat. That's why President Clinton has quietly decided to extend the moratorium on tests of nuclear arms for at least 15 months.
To persuade nuclear have-nots to stay out of the bomb-making business, it makes more sense to halt testing and try to get others to do likewise than to conduct more demonstrations of America's deterrent power.
- For nearly half a century, test explosions in the Nevada desert were a reverberating reminder of cold war insecurity. Now the biggest worry is nuclear proliferation, not the Soviet threat. That's why President Clinton has quietly decided to extend the moratorium on tests of nuclear arms for at least 15 months.
To persuade nuclear have-nots to stay out of the bomb-making business, it makes more sense to halt testing and try to get others to do likewise than to conduct more demonstrations of America's deterrent power.
Not that nuclear wannabes will necessarily follow America's lead. Nor will an end to all testing assure an end to bomb-making; states like Pakistan have developed nuclear devices without testing them first.
But calling a halt to U.S. nuclear testing makes it easier for leaders in Russia and France to extend the moratoriums they are now observing and improve the atmosphere for prompt negotiation of a treaty to ban all tests.
That test ban in turn should shore up international support for the 1968 Nonproliferation Treaty, linchpin of efforts to stop the spread of nuclear arms, when it comes up for review in 1995. It will also bolster the backing for tighter controls on exports used in bomb-making.
Mr. Clinton has taken three helpful steps. He has extended the Congressionally mandated moratorium on U.S. tests that was due to expire last week. He has declared that the U.S. will not test unless another nation does so first. And he wants to negotiate a total ban on testing.
But the President also wants the nuclear labs to be prepared for a prompt resumption of warhead safety and reliability tests. This could cost millions of dollars and doesn't make much sense, since in Mr. Clinton's own words, "After a thorough review, my Administration has determined that the nuclear weapons in the United States' arsenal are safe and reliable."
Moreover, preparations for testing can take on a life of their own: 30 years after the Limited Test Ban Treaty put an end to above-ground tests, the U.S. still spends $20 million a year on Safeguard C, a program to keep test sites ready.
American security no longer rests on that sort of eternal nuclear vigilance. Mr. Clinton's moratorium may make America safer than all the tests and preparations for tests that the nuclear labs can dream up.
提取代码:
对多个文件进行操作:首先递归遍历目录,把所有文件路径存入列表(List)中,然后对列表中的文件路径逐一进行处理。
packagecom.njupt.ymh;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import edu.princeton.cs.algs4.In;
/**
*返回文件名列表
*@author11860
*
*/
publicclassSearchFile{
publicstaticListgetAllFile(StringdirectoryPath,booleanisAddDirectory){
Listlist=newArrayList();//存放文件路径
FilebaseFile=newFile(directoryPath);//当前路径
if(baseFile.isFile()||!baseFile.exists())
returnlist;
File[]files=baseFile.listFiles();//子文件
for(Filefile:files){
if(file.isDirectory())
{
if(isAddDirectory)//isAddDirectory是否将子文件夹的路径也添加到list集合中
list.add(file.getAbsolutePath());//全路径
list.addAll(getAllFile(file.getAbsolutePath(),isAddDirectory));
}
else
{
list.add(file.getAbsolutePath());
}
}
returnlist;
}
publicstaticvoidmain(String[]args){
//SearchFilesFile=newSearchFile();
ListlistFile=SearchFile.getAllFile("E:\\huadai",false);
System.out.println(listFile.size());
Filefile=newFile(listFile.get(3));
Inin=newIn(listFile.get(4));
while(in.hasNextLine()){
StringreadLine=in.readLine().trim();//读取当前行
System.out.println(readLine);
}
System.out.println(file.length());
}
}
packagecom.njupt.ymh;
importjava.io.File;
importjava.util.Iterator;
importjava.util.List;
importorg.dom4j.Document;
importorg.dom4j.DocumentException;
importorg.dom4j.Element;
importorg.dom4j.Node;
importorg.dom4j.io.SAXReader;
publicclassNewsPaper{
intdoc_id;//文章id
Stringdoc_title;//文章标题
Stringlead_paragraph;//文章首段
Stringfull_text;//文章内容
Stringdate;//文章日期
publicNewsPaper(Stringxml){
doc_id=-1;//文章id
doc_title=null;//文章标题
lead_paragraph=null;//文章首段
full_text=null;//文章内容
date=null;//文章日期
searchValue(xml);
}
/**
*加载Document文件
*@paramfileName
*@returnDocument
*/
privateDocumentload(StringfileName){
Documentdocument=null;//文档
SAXReadersaxReader=newSAXReader();//读取文件流
try{
document=saxReader.read(newFile(fileName));
}catch(DocumentExceptione){
e.printStackTrace();
}
returndocument;
}
/**
*获取Document的根节点
*@paramargs
*/
privateElementgetRootNode(Documentdocument){
returndocument.getRootElement();
}
/**
*获取所需节点值
*@paramxml
*/
privatevoidsearchValue(Stringxml){
Documentdocument=load(xml);
Elementroot=getRootNode(document);//根节点
//文章日期
date=xml.substring(10,20);
//文章标题
doc_title=root.valueOf("//head/title");
//文章-id
Listlist_doc_id=document.selectNodes("//doc-id/@id-string");
for(Nodeele:list_doc_id){
doc_id=Integer.parseInt(ele.getText());
}
//文章内容
for(Iteratori=root.elementIterator();i.hasNext();){
Elementel=(Element)i.next();//head、body
//对body节点进行操作
if(el.getName()=="body"){//body
for(Iteratorbody=el.elementIterator();body.hasNext();){
Elementelbody=body.next();
if(elbody.getName()=="body.content"){//body.content
for(Iteratorblock=elbody.elementIterator();block.hasNext();){
Elementblock_class=(Element)block.next();
if(block_class.attributeValue("class").equals("full_text")){//full_text
Listlist_text=block_class.selectNodes("p");
for(Nodetext:list_text)
if(full_text==null)
full_text=text.getStringValue();
else
full_text=full_text+""+text.getStringValue();
}
else{//lead_paragraph
Listlist_lead=block_class.selectNodes("p");
for(Nodelead:list_lead)
if(lead_paragraph==null)
lead_paragraph=lead.getStringValue();
else
lead_paragraph=lead_paragraph+""+lead.getStringValue();
}
}
}
}
}
}
}
/**
*获取文章标题
*@paramargs
*/
publicStringgetTitle(){
returndoc_title;
}
/**
*获取文章id
*@paramargs
*/
publicintgetID(){
returndoc_id;
}
/**
*获取文章简介
*@paramargs
*/
publicStringgetLead(){
if(getID()<394070&&lead_paragraph!=null&&lead_paragraph.length()>6)//1990-10-22之前
returnlead_paragraph.substring(6);
else//1990-10-22之后
returnlead_paragraph;
}
/**
*获取文章正文
*@paramargs
*/
publicStringgetfull(){
if(getID()<394070&&full_text!=null&&full_text.length()>6)//1990-10-22之前
returnfull_text.substring(6);
else
returnfull_text;
}
/**
*获取文章日期
*@paramargs
*/
publicStringgetDate(){
returndate;
}
/**
*判断获取的信息是否有用
*@return
*/
publicbooleanisUseful(){
if(getID()==-1)
returnfalse;
if(getDate()==null)
returnfalse;
if(getTitle()==null||getTitle().length()>=255)
returnfalse;
if(getLead()==null||getLead().length()>=65535)
returnfalse;
if(getfull()==null||getfull().length()>=65535)
returnfalse;
return!isnum();
}
/**
*挑出具有特殊开头的数字内容文章
*@return
*/
privatebooleanisnum(){
if(getfull()!=null&&getfull().length()>24){
if(getfull().substring(0,20).contains("*3***COMPANYREPORT")){//剔除数字文章
returntrue;
}
}
returnfalse;
}
publicstaticvoidmain(String[]args){
ListlistFile=SearchFile.getAllFile("E:\\huadai\\1989\\10",false);//文件列表
//Stringdate;//日期
intcount=0;
inti=0;
for(Stringstring:listFile){
NewsPapernewsPaper=newNewsPaper(string);
count++;
if(!newsPaper.isUseful()){
i++;
System.out.println(newsPaper.getLead());
}
}
System.out.println(i+""+count);
}
}
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。