Python爬取国外天气预报网站的方法
本文实例讲述了Python爬取国外天气预报网站的方法。分享给大家供大家参考。具体如下:
crawl_weather.py如下:
# encoding=utf-8
"""crawl_weather.py — collect AccuWeather location-page URLs.

Starting from a browse-locations entry page, recursively follows every
link that contains "browse-locations" or "weather-forecast" and writes
all discovered weather-forecast URLs to locations.txt.

NOTE(review): reconstructed from a whitespace-mangled Python 2 source and
ported to Python 3 (urllib2 -> urllib.request, Queue -> queue, print
statement -> function).  Spaces inside HTML string literals
("<h6><a href=...") were restored from the mangled originals — confirm
against the live page markup.  Two large commented-out experimental
sections (a Coursera lecture downloader and a single-page fetch test)
were removed as dead code.
"""
import re
import threading
import time
import urllib.request
from queue import Queue
from threading import Thread

lang = "fr"
count = 0          # running total of weather-forecast URLs seen (progress only)


class Location:
    """Plain record for a crawled location.

    Examples (from the original author's comments):
        Location(False, "中国", "北京", "zh")
        Location(True, "", "亚洲", "zh")
    """

    def __init__(self, is_beyond_country, country_name, loc_name, lang):
        self.country_name = country_name
        self.loc_name = loc_name
        self.lang = lang
        self.is_beyond_country = is_beyond_country


prn_lock = threading.RLock()   # serializes progress printing across threads


def GetLocationURLs(url, recursive):
    """Return the list of weather-forecast URLs reachable from *url*.

    A URL that already contains "weather-forecast" is a leaf and is
    returned as a one-element list (no network access).  Otherwise the
    page is fetched and its <h6><a href="..."><em>...</em></a></h6>
    links are followed; with recursive=False only the direct child URLs
    are returned.
    """
    global count
    if url.find("weather-forecast") != -1:
        count = count + 1
        if count % 500 == 0:               # periodic progress report
            with prn_lock:
                print("count:%d" % (count,))
        return [url]

    page = urllib.request.urlopen(url).read().decode("utf-8", "replace")
    time.sleep(0.01)                       # be polite to the server
    # e.g. <h6><a href="http://www.accuweather.com/zh/browse-locations/afr"><em>Africa</em></a></h6>
    pattern = "<h6><a href=\"(.*)\"><em>(.*)</em></a></h6>"
    locs = re.findall(pattern, page)
    locs = [(u, name) for u, name in locs
            if u.find("browse-locations") != -1 or u.find("weather-forecast") != -1]

    if not recursive:
        return [u for u, name in locs]

    urls = []
    for _url, _name in locs:
        urls.extend(GetLocationURLs(_url, True))
    return urls


def main():
    """Crawl the hard-coded entry page and write locations.txt."""
    # regions = ["afr","ant","arc","asi","cac","eur","mea","nam","ocn","sam"]
    entry_url = "http://www.accuweather.com/%s/browse-locations/eur/fr" % (lang,)
    sub_urls = GetLocationURLs(entry_url, False)
    print(len(sub_urls))
    print(sub_urls)

    q = Queue()
    location_urls = []
    thread_num = 5
    lock = threading.RLock()               # guards location_urls

    for url in sub_urls:
        q.put(url)

    def working():
        # Worker: drain the queue, expanding each sub-tree recursively.
        while True:
            url = q.get()
            lst = GetLocationURLs(url, True)
            print("%s %d urls" % (url, len(lst)))
            with lock:
                location_urls.extend(lst)
            q.task_done()

    for _ in range(thread_num):
        t = Thread(target=working, daemon=True)
        t.start()
    q.join()

    with open("locations.txt", "w") as fp:
        fp.write("\n".join(location_urls))


if __name__ == "__main__":
    main()
FetchLocation.py如下:
# encoding=utf-8
"""FetchLocation.py — extract location names and weather conditions.

Reads a file of AccuWeather forecast URLs (argv[1], one per line),
fetches each page with a pool of worker threads, and writes the unique
breadcrumb location names to location_name.fr and the unique condition
strings to conditions.fr.

NOTE(review): reconstructed from a whitespace-mangled Python 2 source
and ported to Python 3 (urllib2 -> urllib.request, Queue -> queue,
HTMLParser().unescape -> html.unescape).  Spaces inside HTML string
literals ('<ul id=...', '<span class=...') were restored from the
mangled originals — confirm against the live page markup.
"""
import html
import re
import sys
import threading
import urllib.request
from queue import Queue
from threading import Thread
from xml.dom import minidom

q = Queue()
locks = [threading.RLock() for _ in range(2)]   # locks[1] guards the shared maps
ThreadNumber = 20
locations = {}   # unique location names (used as a set: name -> 1)
conds = {}       # unique condition strings (used as a set: cond -> 1)


def FindCountryBreadCrumbs(page):
    """Return the <ul id="country-breadcrumbs"> ... </ul> fragment of *page*.

    Returns "" when the breadcrumb list is not present.  (Bug fix: the
    original left `end` undefined in that case and raised NameError.)
    """
    lines = page.splitlines()
    start = -1
    end = -1
    opened = False
    for count, line in enumerate(lines):
        if line.find("<ul id=\"country-breadcrumbs\">") != -1:
            start = count
            opened = True
        if opened and line.find("</ul>") != -1:
            end = count
            opened = False
    if start == -1 or end == -1:
        return ""
    return "\n".join(lines[start:(end + 1)])


def GetText(nodelist):
    """Concatenate the unescaped text of all TEXT_NODE children."""
    rc = []
    for node in nodelist:
        if node.nodeType == node.TEXT_NODE:
            rc.append(html.unescape(node.data))
    return ''.join(rc)


def FindCondition(page):
    """Return all <span class="cond">...</span> strings in *page*, unescaped."""
    pat = "<span class=\"cond\">(.*?)</span>"
    return [html.unescape(cd) for cd in re.findall(pat, page)]


def ExtractInfo(url):
    """Fetch *url* and return (location_names, condition_strings).

    Bug fix: on a fetch failure the original returned a bare [], which
    broke every caller's `locs, cds = ExtractInfo(url)` unpacking; an
    empty pair is returned instead.
    """
    try:
        page = urllib.request.urlopen(url).read().decode("utf-8", "replace")
    except Exception:
        return [], []
    text = FindCountryBreadCrumbs(page)
    text = html.unescape(text)
    locs = []
    if text:
        dom = minidom.parseString(text.encode("utf-8"))
        for li in dom.getElementsByTagName("li"):
            adr_list = li.getElementsByTagName("a")
            if adr_list:
                locs.append(GetText(adr_list[0].childNodes))
            strs = li.getElementsByTagName("strong")
            if strs:
                locs.append(GetText(strs[0].childNodes))
    return locs, FindCondition(page)


def AddMap(lst, m):
    """Insert every element of *lst* into map *m* (value 1), keeping it a set."""
    for x in lst:
        if x not in m:
            m[x] = 1


def working():
    """Worker: process one batch of URLs, then merge results under locks[1]."""
    while True:
        urls = q.get()
        m = {}
        m2 = {}
        for url in urls:
            locs, cds = ExtractInfo(url)
            AddMap(locs, m)
            AddMap(cds, m2)
        with locks[1]:
            AddMap(m.keys(), locations)
            AddMap(m2.keys(), conds)
        q.task_done()


def main():
    if len(sys.argv) < 2:
        exit()
    loc_path = sys.argv[1]
    with open(loc_path, "r") as fp:
        urls = [line.strip() for line in fp]

    # Split the URL list into one batch per worker (integer division —
    # the original Python 2 `/` truncated; `//` preserves that).
    blocks = len(urls) // ThreadNumber + 1
    for start in range(0, len(urls), blocks):
        end = min(start + blocks, len(urls))
        q.put(urls[start:end])

    for _ in range(ThreadNumber):
        t = Thread(target=working, daemon=True)
        t.start()
    q.join()

    with open("location_name.fr", "w") as fp:
        fp.write("\n".join(locations.keys()))
    with open("conditions.fr", "w") as fp:
        fp.write("\n".join(conds.keys()))


if __name__ == '__main__':
    main()
希望本文所述对大家的python程序设计有所帮助。