Python 微信爬虫完整实例【单线程与多线程】
本文实例讲述了Python实现的微信爬虫。分享给大家供大家参考,具体如下:
单线程版:
# -*- coding: utf-8 -*-
"""Single-threaded Sogou/WeChat article crawler (reconstructed).

NOTE(review): the original source was mangled by text extraction — every
space was stripped (``import urllib.request`` became
``importurllib.request``) and HTML tags inside regex/string literals were
eaten.  The code below restores the obvious token boundaries; literals
that lost their tag content are reconstructed from the visible remnants
and flagged with TODO comments for confirmation against the original.
"""

import urllib.request
import urllib.parse
import urllib.error
import re
import time

# Install a global opener with a browser User-Agent so sogou.com serves us.
# TODO(review): spaces inside the UA string were stripped by extraction;
# restored to the standard Chrome 60 UA layout.
headers = ("User-Agent",
           "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
           "(KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
operner = urllib.request.build_opener()
operner.addheaders = [headers]
urllib.request.install_opener(operner)

# Accumulator for the article URLs harvested by get_url().
list_url = []


### Fetch page content (optionally via a proxy)
def use_proxy(url):
    """Fetch *url* and return the page body decoded as UTF-8.

    Returns None when the request fails; errors are printed, not raised
    (mirrors the original tutorial's best-effort behavior).  The proxy
    variant is kept commented out, as in the original.
    """
    try:
        # proxy = urllib.request.ProxyHandler({'http': proxy_addr})  ## proxy version
        # operner = urllib.request.build_opener(proxy)
        # urllib.request.install_opener(operner)
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/60.0.3107.4 Safari/537.36")
        operner = urllib.request.build_opener()
        operner.addheaders = [headers]
        urllib.request.install_opener(operner)
        data = urllib.request.urlopen(url).read().decode('utf-8')
        # print(data)
        return data
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception" + str(e))
        time.sleep(1)  # brief back-off after an unexpected failure


## Collect the URLs to crawl
def get_url(key, pagestart, pageend):
    """Search Sogou-WeChat for *key* over pages pagestart..pageend.

    Appends the cleaned article URLs to the module-level ``list_url`` and
    returns it.  Returns None if an exception escapes the loop.
    """
    try:
        keycode = urllib.parse.quote(key)
        for page in range(pagestart, pageend + 1):
            url = ("http://weixin.sogou.com/weixin?query=%s&_sug_type_=&s_from=input"
                   "&_sug_=n&type=%d&page=1&ie=utf8" % (keycode, page))
            data1 = use_proxy(url)
            # print("data1的内容是", data1)
            # TODO(review): the surrounding <h3>...</h3> tags of this pattern
            # were stripped by extraction; only '.*?("http://.*?)' survived.
            listurl_pattern = '<h3>.*?("http://.*?)</h3>'
            result = re.compile(listurl_pattern, re.S).findall(data1)
            for i in range(len(result)):
                # Strip the HTML-entity remnant and surrounding quotes.
                # NOTE(review): the split separator was reduced to "" by the
                # space-stripping (split("") raises ValueError); a single
                # space is the presumed original — confirm.
                res = result[i].replace("amp;", "").split(" ")[0].replace("\"", "")
                list_url.append(res)
        # print(list_url)
        return list_url
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        elif hasattr(e, "reason"):
            print(e.reason)
    except Exception as e:
        print("exception:", e)


## Crawl each collected URL and write the processed content to disk
def get_url_content(list_url):
    """Download each article in *list_url*; append title + body to 1.html.

    NOTE(review): the tail of this function was truncated in the extracted
    source (it ends mid ``fh.write("``), and the HTML skeleton plus the
    title/content patterns lost their tags — the reconstructions below must
    be confirmed against the original article.
    """
    fh1 = open("D:\\python-script\\1.html", 'wb')
    # TODO(review): original HTML header was eaten by extraction; only the
    # literal text 微信文章 survived.  Minimal reconstruction:
    html1 = '''<html>\n<head>\n<meta charset="utf-8">\n<title>微信文章</title>\n</head>\n<body>\n'''
    fh1.write(html1.encode("utf-8"))
    fh1.close()
    fh = open("D:\\python-script\\1.html", 'ab')
    for url in list_url:
        data_content = use_proxy(url)
        # print(data_content)
        # sys.exit()
        # TODO(review): reconstructed from the stripped remnant '.*?'.
        title_pattern = '<title>(.*?)</title>'
        result_title = re.compile(title_pattern, re.S).findall(data_content)
        # Article title (str).  The original chained tag-stripping replaces
        # were garbled to no-ops by extraction; the capturing group above
        # already excludes the tags.
        res_title = result_title[0].strip()
        # The closing </div> of this pattern was stripped by extraction.
        content_pattern = 'id="js_content">(.*?)</div>'
        content = re.compile(content_pattern, re.S).findall(data_content)
        try:
            fh.write(res_title.encode("utf-8"))
            for i in content:
                fh.write(i.strip().encode("utf-8"))
        except UnicodeEncodeError:
            continue
        # Source truncates at ``fh.write("`` here; presumably a <br>
        # separator between articles — TODO confirm.
        fh.write("<br>".encode("utf-8"))
    fh.write("</body>\n</html>".encode("utf-8"))
    fh.close()