Python通过解析网页实现看报程序的方法
本文所述实例基于Python实现图片报纸《参考消息》的查看功能，可将当天的图片报纸自动下载到本地供查看，具体实现代码如下：
# -*- coding: utf-8 -*-
"""Download today's image edition of the Cankao Xiaoxi newspaper.

Fetches the homepage of www.hqck.net, locates the link to today's image
edition, parses the total page count from the "共N页" marker, then saves
each page's JPEG into E:\\new_folder\\YYYY-MM-DD\\.

Reconstructed from whitespace-mangled Python 2 source: restored spacing,
ported urllib2 -> urllib.request, fixed the undefined ``e`` in the bare
``except``, added the missing ``sys`` import, removed a no-op assignment,
and used ``with`` so the image file handle is not leaked on error.
"""

import os
import re
import socket
import sys
import time
import urllib.error
import urllib.request

# Global socket timeout so a stalled server cannot hang the script forever.
TIMEOUT_SECONDS = 10
HOME_URL = "http://www.hqck.net"
# NOTE(review): hard-coded Windows path, kept from the original script.
IMAGE_ROOT = "E:\\new_folder\\"

# Anchor on the homepage that links to today's image edition.
NEWS_LINK_RE = re.compile(
    r'<a class="item-baozhi" href="/arc/jwbt/ckxx/\d{4}/\d{4}/\w+\.html"'
    r' rel="external nofollow"><span class.+>.+</span></a>'
)
# First image URL embedded in a news page.
IMAGE_RE = re.compile(r'http://image\S+jpg')
# "共N页" ("N pages in total") marker on the first news page.
PAGE_COUNT_RE = re.compile(r'共(\d+)页')


def fetch(url):
    """Return the body of *url* decoded as GBK (the site's encoding)."""
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("gbk", errors="replace")


def extract_news_path(home_page):
    """Return today's news path (``/arc/....html``) from the homepage HTML.

    Returns ``None`` when no matching anchor is present.
    """
    anchors = NEWS_LINK_RE.findall(home_page)
    if not anchors:
        return None
    anchor = anchors[0]
    start = anchor.find("/arc/")
    end = anchor.find(".html") + len(".html")
    return anchor[start:end]


def parse_page_count(first_page):
    """Return the page count from the "共N页" marker, or ``None`` if absent."""
    m = PAGE_COUNT_RE.search(first_page)
    return int(m.group(1)) if m else None


def make_image_folder(root=IMAGE_ROOT):
    """Create (if needed) and return ``root\\YYYY-MM-DD\\`` for today."""
    if not os.path.exists(root):
        os.makedirs(root)
    folder = root + time.strftime("%Y-%m-%d") + "\\"
    if not os.path.exists(folder):
        os.makedirs(folder)
    return folder


def save_image(image_url, image_name):
    """Stream *image_url* to *image_name* in 10 KiB chunks; True on success."""
    try:
        with urllib.request.urlopen(image_url) as src, \
                open(image_name, "wb") as dst:
            print("Getting image ...")
            while True:
                chunk = src.read(1024 * 10)
                if not chunk:
                    break
                dst.write(chunk)
    except Exception:
        # Best-effort: report and keep going with the remaining pages,
        # matching the original script's behaviour.
        print("Save image failed: %s %s" % sys.exc_info()[:2])
        return False
    return True


def download_pages(context_uri, page_count, image_folder):
    """Download pages 1..page_count of the edition; True when all succeed.

    Page 1 lives at ``<context_uri>.html``; page N>1 at
    ``<context_uri>_N.html``.
    """
    ok = True
    for page_index in range(1, page_count + 1):
        page_url = context_uri
        if page_index > 1:
            page_url += "_" + str(page_index)
        page_url += ".html"
        print("News page link = " + page_url)

        try:
            page_html = fetch(page_url)
        except urllib.error.URLError as e:
            print(e.reason)
            return False

        images = IMAGE_RE.findall(page_html)
        if not images:
            print("Cannot find news page %d!" % page_index)
            return False

        image_url = images[0]
        print("News image url = " + image_url)
        image_name = image_folder + "page_" + str(page_index) + ".jpg"
        if save_image(image_url, image_name):
            print("Save image %d succeed!" % page_index)
        else:
            print("Save image %d failed!" % page_index)
            ok = False
    return ok


def main():
    socket.setdefaulttimeout(TIMEOUT_SECONDS)

    try:
        home_page = fetch(HOME_URL)
        print("Read home page finished.")
        print("-" * 49)
    except urllib.error.URLError as e:
        print(e)
        return

    news_path = extract_news_path(home_page)
    if news_path is None:
        print("Cannot find today's news!")
        return
    full_news_url = HOME_URL + news_path
    print("Full url = " + full_news_url)

    image_folder = make_image_folder()
    print("News image folder = " + image_folder)

    # Strip the trailing ".html": page N>1 is "<context_uri>_N.html".
    context_uri = full_news_url[:-len(".html")]
    try:
        first_page = fetch(context_uri + ".html")
    except urllib.error.HTTPError as e:
        print(e.code)
        return

    page_count = parse_page_count(first_page)
    if page_count is None:
        print("Cannot parse the page count!")
        return
    print("Total %d pages:" % page_count)

    if download_pages(context_uri, page_count, image_folder):
        print('News download succeed! Path = "%s"' % image_folder)
        print("Enjoy it! ^^")
    else:
        print("News download failed!")


if __name__ == "__main__":
    main()