Testing the Python3WebSpider Proxy Pool Crawler: A Source Code Walkthrough
This article walks through testing the crawler source code from the Python3WebSpider proxy pool. The process is illustrated with detailed sample code, which should be a useful reference for study or work.
Using metaclass attributes
Code
This section is mainly about how the metaclass is used.

The metaclass collects a subset of the attributes of the crawler class it generates: here, the crawl methods, that is, every method whose name starts with the same prefix. Their names are gathered into a list attribute so they can be called one after another. The payoff is that supporting a new site only means adding another crawl method; the rest of the class stays untouched. The minimal sketch below shows the pattern in isolation before the project code.
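A minimal sketch of the pattern, with illustrative names that are not from the project: at class-creation time the metaclass inspects the class body (attrs) and records every method whose name contains a given prefix.

class CollectByPrefix(type):
    def __new__(cls, name, bases, attrs):
        # Record the names of all attributes containing the 'crawl_' prefix
        attrs['__CrawlFunc__'] = [k for k in attrs if 'crawl_' in k]
        return type.__new__(cls, name, bases, attrs)


class Demo(metaclass=CollectByPrefix):
    def crawl_a(self):
        pass

    def crawl_b(self):
        pass

    def helper(self):
        pass


print(Demo.__CrawlFunc__)  # ['crawl_a', 'crawl_b']; helper is not collected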
Partial code:
class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        # Collect the name of every method whose name contains 'crawl_'
        for k, v in attrs.items():
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Successfully obtained proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """
        Fetch proxies from daili66
        :param page_count: number of pages
        :return: proxies
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
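A side note on get_proxies: it builds the method call with eval, which executes whatever string it is given. A minimal sketch of a safer drop-in replacement inside Crawler, using getattr (a suggested variant, not code from the project):

    def get_proxies(self, callback):
        proxies = []
        # getattr looks the method up by name; unlike eval, it cannot
        # execute an arbitrary expression passed in as a string
        for proxy in getattr(self, callback)():
            print('Successfully obtained proxy', proxy)
            proxies.append(proxy)
        return proxies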
Test method
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 12/19/19 4:10 PM
# @Author: yon
# @Email: @qq.com
# @File: test

import json
import re
from pyquery import PyQuery as pq

# Note: get_page (the project's HTTP helper) is not imported here. The test
# still runs cleanly because the calls that would reach it are commented out
# in Getter.run() below.


class ProxyMetaclass(type):
    def __new__(cls, name, bases, attrs):
        count = 0
        attrs['__CrawlFunc__'] = []
        for k, v in attrs.items():
            print("printing k")
            print(k)
            print("printing v")
            print(v)
            if 'crawl_' in k:
                attrs['__CrawlFunc__'].append(k)
                count += 1
        attrs['__CrawlFuncCount__'] = count
        return type.__new__(cls, name, bases, attrs)


class Crawler(object, metaclass=ProxyMetaclass):
    def get_proxies(self, callback):
        proxies = []
        for proxy in eval("self.{}()".format(callback)):
            print('Successfully obtained proxy', proxy)
            proxies.append(proxy)
        return proxies

    def crawl_daili66(self, page_count=4):
        """
        Fetch proxies from daili66
        :param page_count: number of pages
        :return: proxies
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])

    def crawl_ip3366(self):
        for page in range(1, 4):
            start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
            html = get_page(start_url)
            ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>', re.S)
            # \s* matches whitespace, so the pattern can span line breaks
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')

    def crawl_kuaidaili(self):
        for i in range(1, 4):
            start_url = 'http://www.kuaidaili.com/free/inha/{}/'.format(i)
            html = get_page(start_url)
            if html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_xicidaili(self):
        for i in range(1, 3):
            start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
                'Host': 'www.xicidaili.com',
                'Referer': 'http://www.xicidaili.com/nn/3',
                'Upgrade-Insecure-Requests': '1',
            }
            html = get_page(start_url, options=headers)
            if html:
                find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
                trs = find_trs.findall(html)
                for tr in trs:
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(tr)
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(tr)
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    # Note: this second definition of crawl_ip3366 silently replaces the first
    # one in the class namespace, so only one entry ends up in __CrawlFunc__.
    def crawl_ip3366(self):
        for i in range(1, 4):
            start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
            html = get_page(start_url)
            if html:
                find_tr = re.compile('<tr>(.*?)</tr>', re.S)
                trs = find_tr.findall(html)
                for s in range(1, len(trs)):
                    find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                    re_ip_address = find_ip.findall(trs[s])
                    find_port = re.compile('<td>(\d+)</td>')
                    re_port = find_port.findall(trs[s])
                    for address, port in zip(re_ip_address, re_port):
                        address_port = address + ':' + port
                        yield address_port.replace(' ', '')

    def crawl_iphai(self):
        start_url = 'http://www.iphai.com/'
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')

    def crawl_data5u(self):
        start_url = 'http://www.data5u.com/free/gngn/index.shtml'
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
            'Host': 'www.data5u.com',
            'Referer': 'http://www.data5u.com/free/index.shtml',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
        }
        html = get_page(start_url, options=headers)
        if html:
            ip_address = re.compile('<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class="port.*?>(\d+)</li>', re.S)
            re_ip_address = ip_address.findall(html)
            for address, port in re_ip_address:
                result = address + ':' + port
                yield result.replace(' ', '')


class Getter():
    def __init__(self):
        self.crawler = Crawler()

    def run(self):
        print('Getter starts running')
        for callback_label in range(self.crawler.__CrawlFuncCount__):
            print(callback_label)
            callback = self.crawler.__CrawlFunc__[callback_label]
            print(callback)
            # # fetch the proxies
            # proxies = self.crawler.get_proxies(callback)
            # sys.stdout.flush()
            # for proxy in proxies:
            #     self.redis.add(proxy)


if __name__ == '__main__':
    get = Getter()
    get.run()
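get_page is the project's HTTP helper and is never defined in this test script. For a self-contained run you could supply a minimal stand-in like the sketch below, assuming the requests package is installed (this is a substitute for illustration, not the project's actual implementation):

import requests


def get_page(url, options=None):
    """Fetch a URL and return its HTML, or None on failure."""
    # Merge caller-supplied headers with a default User-Agent
    headers = {'User-Agent': 'Mozilla/5.0'}
    headers.update(options or {})
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None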
Test result
/home/baixiaoxu/PycharmProjects/pytthon-tt/venv/bin/python /home/baixiaoxu/PycharmProjects/pytthon-tt/proxypool/test.py
printing k
__module__
printing v
__main__
printing k
__qualname__
printing v
Crawler
printing k
get_proxies
printing v
<function Crawler.get_proxies at 0x...>
printing k
crawl_daili66
printing v
<function Crawler.crawl_daili66 at 0x...>
printing k
crawl_ip3366
printing v
<function Crawler.crawl_ip3366 at 0x...>
printing k
crawl_kuaidaili
printing v
<function Crawler.crawl_kuaidaili at 0x...>
printing k
crawl_xicidaili
printing v
<function Crawler.crawl_xicidaili at 0x...>
printing k
crawl_iphai
printing v
<function Crawler.crawl_iphai at 0x...>
printing k
crawl_data5u
printing v
<function Crawler.crawl_data5u at 0x...>
printing k
__CrawlFunc__
printing v
['crawl_daili66', 'crawl_ip3366', 'crawl_kuaidaili', 'crawl_xicidaili', 'crawl_iphai', 'crawl_data5u']
Getter starts running
0
crawl_daili66
1
crawl_ip3366
2
crawl_kuaidaili
3
crawl_xicidaili
4
crawl_iphai
5
crawl_data5u

Process finished with exit code 0
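One detail of this output is easy to miss: the test file defines crawl_ip3366 twice, yet it shows up only once in __CrawlFunc__ and once in the enumeration. Both definitions share one key in the class namespace, so the second silently replaces the first before the metaclass ever iterates over attrs. A tiny sketch (illustrative names) demonstrates the behavior:

class Demo:
    def f(self):
        return 'first'

    def f(self):  # same name: silently replaces the definition above
        return 'second'


print(Demo().f())  # prints 'second'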