Python爬取读者并制作成PDF
学了下 BeautifulSoup 后,做了个网络爬虫,爬取《读者》杂志,并用 reportlab 制作成 PDF。
crawler.py
#!/usr/bin/envpython #coding=utf-8 """ Author: Anemone Filename: getmain.py Lastmodified: 2015-02-1916:47 E-mail: anemone@82flex.com """ importurllib2 frombs4importBeautifulSoup importre importsys reload(sys) sys.setdefaultencoding('utf-8') defgetEachArticle(url): # response=urllib2.urlopen('http://www.52duzhe.com/2015_01/duzh20150104.html') response=urllib2.urlopen(url) html=response.read() soup=BeautifulSoup(html)#.decode("utf-8").encode("gbk")) #foriinsoup.find_all('div'): # printi,1 title=soup.find("h1").string writer=soup.find(id="pub_date").string.strip() _from=soup.find(id="media_name").string.strip() text=soup.get_text()#.encode("utf-8") main=re.split("BAIDU_CLB.*;",text) result={"title":title,"writer":writer,"from":_from,"context":main[1]} returnresult #new=open("new.txt","w") #new.write(result["title"]+"\n\n") #new.write(result["writer"]+" "+result["from"]) #new.write(result["context"]) #new.close() defgetCatalog(issue): url="http://www.52duzhe.com/"+issue[:4]+"_"+issue[-2:]+"/" firstUrl=url+"duzh"+issue+"01.html" firstUrl=url+"index.html" duzhe=dict() response=urllib2.urlopen(firstUrl) html=response.read() soup=BeautifulSoup(html) firstUrl=url+soup.table.a.get("href") response=urllib2.urlopen(firstUrl) html=response.read() soup=BeautifulSoup(html) all=soup.find_all("h2") foriinall: printi.string duzhe[i.string]=list() forlinkini.parent.find_all("a"): href=url+link.get("href") printhref while1: try: article=getEachArticle(href) break except: continue duzhe[i.string].append(article) returnduzhe defreadDuZhe(duzhe): foreachColumninduzhe: foreachArticleinduzhe[eachColumn]: printeachArticle["title"] if__name__=='__main__': # issue=raw_input("issue(201501):") readDuZhe(getCatalog("201424"))
getpdf.py
#!/usr/bin/envpython #coding=utf-8 """ Author: Anemone Filename: writetopdf.py Lastmodified: 2015-02-2019:19 E-mail: anemone@82flex.com """ #coding=utf-8 importreportlab.rl_config fromreportlab.pdfbaseimportpdfmetrics fromreportlab.pdfbase.ttfontsimportTTFont fromreportlab.libimportfonts importcopy fromreportlab.platypusimportParagraph,SimpleDocTemplate,flowables fromreportlab.lib.stylesimportgetSampleStyleSheet importcrawler defwritePDF(issue,duzhe): reportlab.rl_config.warnOnMissingFontGlyphs=0 pdfmetrics.registerFont(TTFont('song',"simsun.ttc")) pdfmetrics.registerFont(TTFont('hei',"msyh.ttc")) fonts.addMapping('song',0,0,'song') fonts.addMapping('song',0,1,'song') fonts.addMapping('song',1,0,'hei') fonts.addMapping('song',1,1,'hei') stylesheet=getSampleStyleSheet() normalStyle=copy.deepcopy(stylesheet['Normal']) normalStyle.fontName='song' normalStyle.fontSize=11 normalStyle.leading=11 normalStyle.firstLineIndent=20 titleStyle=copy.deepcopy(stylesheet['Normal']) titleStyle.fontName='song' titleStyle.fontSize=15 titleStyle.leading=20 firstTitleStyle=copy.deepcopy(stylesheet['Normal']) firstTitleStyle.fontName='song' firstTitleStyle.fontSize=20 firstTitleStyle.leading=20 firstTitleStyle.firstLineIndent=50 smallStyle=copy.deepcopy(stylesheet['Normal']) smallStyle.fontName='song' smallStyle.fontSize=8 smallStyle.leading=8 story=[] story.append(Paragraph("<b>读者{0}期</b>".format(issue),firstTitleStyle)) foreachColumninduzhe: story.append(Paragraph('__'*28,titleStyle)) story.append(Paragraph('<b>{0}</b>'.format(eachColumn),titleStyle)) foreachArticleinduzhe[eachColumn]: story.append(Paragraph(eachArticle["title"],normalStyle)) story.append(flowables.PageBreak()) foreachColumninduzhe: foreachArticleinduzhe[eachColumn]: story.append(Paragraph("<b>{0}</b>".format(eachArticle["title"]),titleStyle)) story.append(Paragraph("{0} {1}".format(eachArticle["writer"],eachArticle["from"]),smallStyle)) para=eachArticle["context"].split("") foreachParainpara: 
story.append(Paragraph(eachPara,normalStyle)) story.append(flowables.PageBreak()) #story.append(Paragraph("context",normalStyle)) doc=SimpleDocTemplate("duzhe"+issue+".pdf") print"WritingPDF..." doc.build(story) defmain(issue): duzhe=crawler.getCatalog(issue) writePDF(issue,duzhe) if__name__=='__main__': issue=raw_input("Enterissue(201501):") main(issue)
以上就是本文的全部内容了,希望大家能够喜欢。