python实现将html表格转换成CSV文件的方法
本文实例讲述了python实现将html表格转换成CSV文件的方法。分享给大家供大家参考。具体如下:
使用方法:python html2csv.py *.html
这段代码使用了HTMLParser模块
#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
# Hello, this program is written in Python - http://python.org
"""html2csv - convert the tables found in HTML files into CSV files.

Run it as a script (``python html2csv.py *.html``) or import the
``html2csv`` parser class.  The code runs on both Python 2 and Python 3.
"""
import sys
import getopt
import os.path
import glob
import re

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 home of the same class

try:
    # If present, use psyco to accelerate the program (optional, best effort).
    import psyco
    psyco.jit()
except Exception:
    pass

programname = 'html2csv - version 2002-09-20 - http://sebsauvage.net'

# A single-byte codec maps every byte to exactly one character and back, so
# file content passes through the parser without being altered and a read
# chunk can never end in the middle of a character.
_IO_ENCODING = 'iso-8859-1'

# Number of bytes read from the HTML file per parser.feed() call.
_CHUNK_SIZE = 8192


def usage(progname):
    '''Return the program usage text.

    progname is the path the script was started with (sys.argv[0]); only
    its base name is shown, prefixed with "python " for .py/.pyc files.
    '''
    progname = os.path.split(progname)[1]
    if os.path.splitext(progname)[1] in ['.py', '.pyc']:
        progname = 'python ' + progname
    return '''%s
A coarse HTML tables to CSV (Comma-Separated Values) converter.

Syntax    : %s source.html

Arguments : source.html is the HTML file you want to convert to CSV.
            By default, the file will be converted to csv with the same
            name and the csv extension (source.html -> source.csv)
            You can use * and ?.

Examples  : %s mypage.html
          : %s *.html

This program is public domain.
Author : Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
         http://sebsauvage.net
''' % (programname, progname, progname, progname)


class html2csv(HTMLParser.HTMLParser):
    '''A basic parser which converts HTML tables into CSV.

    Feed HTML with feed(). Get CSV with getCSV(). (See example below.)
    All tables in HTML will be converted to CSV (in the order they occur
    in the HTML file).
    You can process very large HTML files by feeding this class with chunks
    of html while getting chunks of CSV by calling getCSV().
    Should handle badly formated html (missing <tr>, </tr>, </td>,
    extraneous </td>, </tr>...).
    Both <td> and <th> cells are written to the CSV output.
    Runs of whitespace inside a cell (spaces, tabs, line breaks) are
    collapsed to a single space; double quotes are doubled.
    This parser uses HTMLParser from the HTMLParser module (html.parser
    on Python 3), not HTMLParser from the htmllib module.

    Example: parser = html2csv()
             parser.feed(open('mypage.html').read())
             open('mytables.csv', 'w').write(parser.getCSV(True))

    This class is public domain.
    Author: Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
            http://sebsauvage.net
    Versions:
       2002-09-19 : - First version
       2002-09-20 : - now uses HTMLParser.HTMLParser instead of
                      htmllib.HTMLParser.
                    - now parses command-line.
    Todo:
        - handle <PRE> tags
        - convert html entities (&name; and &#ref;) to Ascii
          (they are currently left out of the output).
    '''

    def __init__(self):
        try:
            # Python 3.4+: keep character references out of handle_data()
            # so the output is the same as with the Python 2 parser.
            HTMLParser.HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            # Older parsers have no such option and already behave that way.
            HTMLParser.HTMLParser.__init__(self)
        self.CSV = ''     # The CSV data not yet handed out by getCSV()
        self.CSVrow = ''  # The current CSV row being constructed from HTML
        self.inTD = 0     # Tracks if we are inside a <TD>...</TD> cell.
        self.inTR = 0     # Tracks if we are inside a <TR>...</TR> row.
        # Matches runs of ASCII whitespace only, so bytes >= 0x80 (which may
        # belong to a multi-byte character) are never touched.
        self.re_multiplespaces = re.compile(r'[ \t\n\r\f\v]+')
        self.rowCount = 0  # CSV output line counter.

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.start_tr()
        elif tag in ('td', 'th'):  # header cells are cells too
            self.start_td()

    def handle_endtag(self, tag):
        if tag == 'tr':
            self.end_tr()
        elif tag in ('td', 'th'):
            self.end_td()

    def start_tr(self):
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        if not self.inTR:
            return  # extraneous </TR>: do not emit an empty CSV line
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if self.CSVrow:
            self.CSV += self.CSVrow[:-1]  # drop the trailing comma
            self.CSVrow = ''
        self.CSV += '\n'
        self.rowCount += 1

    def start_td(self):
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        if self.inTD:
            self.end_td()    # <TD> implies </TD>
        self.CSVrow += '"'
        self.inTD = 1

    def end_td(self):
        if self.inTD:  # an extraneous </TD> is ignored
            self.CSVrow += '",'
            self.inTD = 0

    def handle_data(self, data):
        if self.inTD:
            escaped = data.replace('"', '""')
            self.CSVrow += self.re_multiplespaces.sub(' ', escaped)

    def getCSV(self, purge=False):
        '''Get output CSV.

        Returns the CSV text produced since the previous call and clears
        the internal buffer.
        If purge is true, getCSV() will return all remaining data,
        even if <td> or <tr> are not properly closed.
        (You would typically call getCSV with purge=True when you do not
        have any more HTML to feed and you suspect dirty HTML
        (unclosed tags).)
        '''
        if purge and self.inTR:
            # This will also end_td and append the last CSV row to the output.
            self.end_tr()
        dataout = self.CSV
        self.CSV = ''
        return dataout


def _convert_file(htmlfilename, outputfilename):
    '''Convert one HTML file to CSV, chunk by chunk; return the row count.'''
    parser = html2csv()
    with open(htmlfilename, 'rb') as htmlfile:
        with open(outputfilename, 'wb') as csvfile:
            data = htmlfile.read(_CHUNK_SIZE)
            while data:
                parser.feed(data.decode(_IO_ENCODING))
                csvfile.write(parser.getCSV().encode(_IO_ENCODING))
                sys.stdout.write('%d CSV rows written.\r' % parser.rowCount)
                data = htmlfile.read(_CHUNK_SIZE)
            # Flush a last row left open by unclosed tags.
            csvfile.write(parser.getCSV(True).encode(_IO_ENCODING))
    return parser.rowCount


def main(argv=None):
    '''Command-line entry point.

    argv defaults to sys.argv. Every argument is treated as a file name or
    glob pattern; each matching file X.html is converted to X.csv.
    Exits with status 2 after printing the usage text when the command
    line is invalid or names no file.
    '''
    if argv is None:
        argv = sys.argv
    try:  # Put getopt in place for future usage.
        _opts, args = getopt.getopt(argv[1:], '')
    except getopt.GetoptError:
        print(usage(argv[0]))  # print help information and exit
        sys.exit(2)
    if not args:
        print(usage(argv[0]))  # print help information and exit
        sys.exit(2)
    print(programname)
    # Go through every argument: Unix shells expand wildcards themselves,
    # so "*.html" arrives here as a list of file names.
    for pattern in args:
        for htmlfilename in glob.glob(pattern):
            outputfilename = os.path.splitext(htmlfilename)[0] + '.csv'
            print('Reading %s, writing %s...' % (htmlfilename, outputfilename))
            try:
                _convert_file(htmlfilename, outputfilename)
            except Exception as error:
                # Best effort: report the failure and go on with the next file.
                print('Error converting %s: %s' % (htmlfilename, error))
    print('All done.')


if __name__ == "__main__":
    main()
希望本文所述对大家的Python程序设计有所帮助。