python实现将html表格转换成CSV文件的方法

2024-03-30 03:08:03 40
本文实例讲述了python实现将html表格转换成CSV文件的方法。分享给大家供大家参考。具体如下：
使用方法：python html2csv.py *.html
这段代码使用了HTMLParser模块
#!/usr/bin/python
# -*- coding: iso-8859-1 -*-
# Hello, this program is written in Python - http://python.org
# Banner printed at start-up and used as the first line of the usage text.
programname = 'html2csv - version 2002-09-20 - http://sebsauvage.net'
import getopt
import glob
import os.path
import re
import sys

try:
    import HTMLParser  # Python 2 module name
except ImportError:
    import html.parser as HTMLParser  # Python 3 home of the same HTMLParser class

try:
    import psyco  # If present, use psyco to accelerate the program
    psyco.jit()
except Exception:
    pass
def usage(progname):
    ''' Return the program usage text.

        progname is the path the script was started with (typically
        sys.argv[0]).  Only its base name is shown; when it ends with
        .py or .pyc it is prefixed with "python " so that the displayed
        command line can be typed as is.

        The text is returned, not printed: the caller decides where to
        send it.
    '''
    progname = os.path.split(progname)[1]
    if os.path.splitext(progname)[1] in ['.py', '.pyc']:
        progname = 'python ' + progname
    return '''%s
A coarse HTML tables to CSV (Comma-Separated Values) converter.

Syntax    : %s source.html

Arguments : source.html is the HTML file you want to convert to CSV.
            By default, the file will be converted to csv with the same
            name and the csv extension (source.html -> source.csv)
            You can use * and ?.

Examples  : %s mypage.html
          : %s *.html

This program is public domain.
Author : Sebastien SAUVAGE <sebsauvage at sebsauvage dot net>
         http://sebsauvage.net
''' % (programname, progname, progname, progname)
class html2csv(HTMLParser.HTMLParser):
    ''' A basic parser which converts HTML tables into CSV.

        Feed HTML with feed(). Get CSV with getCSV(). (See example below.)
        All tables in HTML will be converted to CSV (in the order they occur
        in the HTML file).
        You can process very large HTML files by feeding this class with chunks
        of html while getting chunks of CSV by calling getCSV().
        Should handle badly formated html (missing <tr>, </tr>, </td>,
        extraneous </td>, </tr>...).
        This parser uses HTMLParser from the HTMLParser module,
        not HTMLParser from the htmllib module.

        Only <tr> and <td> tags are looked at; every cell is written between
        double quotes, with embedded double quotes doubled.

        Example: parser = html2csv()
                 parser.feed(open('mypage.html').read())
                 open('mytables.csv', 'w').write(parser.getCSV(True))

        This class is public domain.
        Author: Sébastien SAUVAGE <sebsauvage at sebsauvage dot net>
                http://sebsauvage.net
        Versions:
           2002-09-19 : - First version
           2002-09-20 : - now uses HTMLParser.HTMLParser instead of htmllib.HTMLParser.
                        - now parses command-line.
        To do:
            - handle <PRE> tags
            - convert html entities (&name; and &#ref;) to Ascii.
    '''

    def __init__(self):
        # Explicit base-class call (no super()) keeps the same code working
        # whichever HTMLParser module the file imported.
        HTMLParser.HTMLParser.__init__(self)
        self.CSV = ''      # The CSV data
        self.CSVrow = ''   # The current CSV row being constructed from HTML
        self.inTD = 0      # Used to track if we are inside or outside a <TD>...</TD> tag.
        self.inTR = 0      # Used to track if we are inside or outside a <TR>...</TR> tag.
        self.re_multiplespaces = re.compile(r'\s+')  # regular expression used to remove spaces in excess
        self.rowCount = 0  # CSV output line counter.

    def handle_starttag(self, tag, attrs):
        ''' Dispatch opening <tr> and <td> tags; all other tags are ignored. '''
        if tag == 'tr':
            self.start_tr()
        elif tag == 'td':
            self.start_td()

    def handle_endtag(self, tag):
        ''' Dispatch closing </tr> and </td> tags; all other tags are ignored. '''
        if tag == 'tr':
            self.end_tr()
        elif tag == 'td':
            self.end_td()

    def start_tr(self):
        ''' Open a new row, closing the previous one if it was left open. '''
        if self.inTR:
            self.end_tr()  # <TR> implies </TR>
        self.inTR = 1

    def end_tr(self):
        ''' Close the current row and append it to the pending CSV output. '''
        if self.inTD:
            self.end_td()  # </TR> implies </TD>
        self.inTR = 0
        if len(self.CSVrow) > 0:
            # Every cell ends with '",': drop the comma after the last one.
            self.CSV += self.CSVrow[:-1]
            self.CSVrow = ''
        # A row without any cell still produces an (empty) output line.
        self.CSV += '\n'
        self.rowCount += 1

    def start_td(self):
        ''' Open a new cell, closing the previous one if it was left open. '''
        if self.inTD:
            # <TD> implies </TD>. Without this, "<td>a<td>b" would put the
            # opening quote of the second cell inside the first one.
            self.end_td()
        if not self.inTR:
            self.start_tr()  # <TD> implies <TR>
        self.CSVrow += '"'
        self.inTD = 1

    def end_td(self):
        ''' Close the current cell; an extraneous </td> is ignored. '''
        if self.inTD:
            self.CSVrow += '",'
            self.inTD = 0

    def handle_data(self, data):
        ''' Append text found inside a cell to the current row.

            Line breaks are removed, double quotes are doubled (CSV
            escaping) and each run of whitespace (tabs included) is reduced
            to a single space.  Text outside of a cell is discarded.
        '''
        if self.inTD:
            text = data.replace('\n', '').replace('\r', '').replace('"', '""')
            self.CSVrow += self.re_multiplespaces.sub(' ', text)

    def getCSV(self, purge=False):
        ''' Get output CSV.

            Returns the CSV text produced since the previous call and
            empties the internal buffer.
            If purge is true, getCSV() will return all remaining data,
            even if <td> or <tr> are not properly closed.
            (You would typically call getCSV with purge=True when you do not
            have any more HTML to feed and you suspect dirty HTML (unclosed
            tags).)
        '''
        if purge and self.inTR:
            self.end_tr()  # This will also end_td and append last CSV row to output CSV.
        dataout = self.CSV[:]
        self.CSV = ''
        return dataout
if __name__ == "__main__":
    # Command-line entry point: every argument is a file name or a wildcard
    # pattern; each matching HTML file is converted to a .csv file with the
    # same base name.
    try:  # Put getopt in place for future usage.
        # The short-option spec must be a string: with None, getopt fails
        # with a TypeError instead of GetoptError as soon as an option is
        # given on the command line.
        opts, args = getopt.getopt(sys.argv[1:], '')
    except getopt.GetoptError:
        print(usage(sys.argv[0]))  # print help information and exit:
        sys.exit(2)
    if not args:
        print(usage(sys.argv[0]))  # print help information and exit:
        sys.exit(2)
    print(programname)
    # Expand every argument, not only the first one: a shell that expands
    # *.html itself passes each file name as a separate argument.
    html_files = []
    for pattern in args:
        html_files.extend(glob.glob(pattern))
    for htmlfilename in html_files:
        outputfilename = os.path.splitext(htmlfilename)[0] + '.csv'
        parser = html2csv()
        print('Reading %s, writing %s...' % (htmlfilename, outputfilename))
        try:
            # 'with' closes both files on success and on error.
            with open(htmlfilename, 'rb') as htmlfile:
                with open(outputfilename, 'wb') as csvfile:
                    # The parser works on text.  latin-1 maps each byte to the
                    # character with the same value, so decoding the input and
                    # encoding the output with it passes the file's bytes
                    # through unchanged.  Characters outside latin-1 are
                    # written as numeric character references rather than
                    # aborting the conversion.
                    data = htmlfile.read(8192)
                    while data:
                        parser.feed(data.decode('latin-1'))
                        csvfile.write(parser.getCSV().encode('latin-1', 'xmlcharrefreplace'))
                        sys.stdout.write('%d CSV rows written.\r' % parser.rowCount)
                        data = htmlfile.read(8192)
                    # Flush the last row even if the HTML left it unclosed.
                    csvfile.write(parser.getCSV(True).encode('latin-1', 'xmlcharrefreplace'))
        except Exception as err:
            # Report the failure and carry on with the remaining files.
            print('Error converting %s: %s' % (htmlfilename, err))
    print('All done.')
希望本文所述对大家的Python程序设计有所帮助。
python实现将html表格转换成CSV文件的方法

热门推荐

随机推荐