尝试使用Python多线程抓取代理服务器IP地址的示例
这里以抓取 http://www.proxy.com.ru 站点的代理服务器为例,代码如下:
#!/usr/bin/env python
# coding: utf-8
"""Scrape proxy servers with worker threads, verify them, and persist them.

Workflow when run as a script:

1. One ``ProxyGet`` thread per listing page downloads the page and appends
   ``[ip, port, address]`` entries to ``rawProxyList``.
2. ``ProxyCheck`` threads each take a slice of the raw list, fetch a test URL
   through every proxy, and append ``(ip, port, address, seconds)`` to
   ``checkedProxyList`` when the expected marker is found in the response.
3. Proxies that answered in under 8 seconds are sorted by response time,
   written to ``proxy_list.txt`` and inserted into the MySQL ``proxy`` table.

Only syntax that is valid on both Python 2.6+ and Python 3 is used.
"""
import contextlib
import io
import operator
import re
import threading
import time

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3: urllib.request provides the urlopen / ProxyHandler /
    # build_opener / HTTPCookieProcessor names used below.
    import urllib.request as urllib2

try:
    import MySQLdb
except ImportError:
    # The database step is skipped when the driver is missing; the text file
    # is still written.
    MySQLdb = None

# Encoding the listing pages are decoded with.
PAGE_ENCODING = "cp936"

# Browser-like User-Agent sent with every verification request.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.2; WOW64; rv:22.0) '
              'Gecko/20100101 Firefox/22.0')

# Shared result lists, appended to by the worker threads.
rawProxyList = []
checkedProxyList = []

# Listing pages to scrape (pages 1..41).
targets = [r"http://www.proxy.com.ru/list_%d.html" % i for i in range(1, 42)]

# One table row of a listing page; group 2 is the IP, group 3 the port and
# group 5 the address text.
p = re.compile(r'''<tr><b><td>(\d+)</td><td>(.+?)</td><td>(\d+)</td><td>(.+?)</td><td>(.+?)</td></b></tr>''')


def parse_proxy_page(html):
    """Return the ``[ip, port, address]`` entries found in one listing page.

    ``html`` is the raw page as bytes. The page is decoded once, with
    undecodable bytes replaced, so a single bad byte cannot abort the page.
    """
    text = html.decode(PAGE_ENCODING, "replace")
    return [[row[1], row[2], row[4]] for row in p.findall(text)]


def contains_marker(body, marker):
    """Return True when ``marker`` occurs anywhere in the bytes ``body``.

    ``marker`` may be text or bytes; text is encoded as UTF-8 first.
    """
    if not isinstance(marker, bytes):
        marker = marker.encode("utf-8")
    return marker in body


def split_evenly(items, parts):
    """Split ``items`` into at most ``parts`` consecutive non-empty slices.

    Every slice except possibly the last has ``ceil(len(items) / parts)``
    elements. ``parts`` must be a positive integer. An empty ``items``
    yields an empty list.
    """
    if not items:
        return []
    size = (len(items) + parts - 1) // parts
    return [items[start:start + size] for start in range(0, len(items), size)]


def rank_usable_proxies(checked, max_seconds=8):
    """Return the entries of ``checked`` faster than ``max_seconds``.

    ``checked`` holds ``(ip, port, address, seconds)`` tuples; the result is
    sorted by ascending response time.
    """
    ranked = sorted(checked, key=operator.itemgetter(3))
    return [proxy for proxy in ranked if proxy[3] < max_seconds]


def save_proxy_list(proxies, path="proxy_list.txt"):
    """Write ``ip:port<TAB>address<TAB>seconds`` lines to ``path`` as UTF-8."""
    with io.open(path, "w", encoding="utf-8") as out:
        for ip, port, addr, timeused in proxies:
            out.write(u"%s:%s\t%s\t%s\n" % (ip, port, addr, timeused))


class ProxyGet(threading.Thread):
    """Thread that downloads one listing page and collects its proxies."""

    def __init__(self, target, timeout=10):
        """Remember the page URL and the download timeout in seconds."""
        threading.Thread.__init__(self)
        self.target = target
        self.timeout = timeout

    def getProxy(self):
        """Fetch ``self.target`` and append its proxies to ``rawProxyList``.

        A download failure is reported and skipped instead of ending the
        thread with a traceback.
        """
        print("代理服务器目标网站:" + self.target)
        try:
            with contextlib.closing(
                    urllib2.urlopen(self.target, timeout=self.timeout)) as resp:
                html = resp.read()
        except Exception as exc:  # thread boundary: report and give up on this page
            print("failed to fetch %s: %s" % (self.target, exc))
            return
        for proxy in parse_proxy_page(html):
            print(proxy)
            rawProxyList.append(proxy)

    def run(self):
        self.getProxy()


class ProxyCheck(threading.Thread):
    """Thread that verifies a slice of the scraped proxies."""

    def __init__(self, proxyList):
        """Store the proxies to verify and the verification settings."""
        threading.Thread.__init__(self)
        self.proxyList = proxyList
        self.timeout = 5
        self.testUrl = "http://www.baidu.com/"
        # Marker that must appear in the page fetched through the proxy.
        self.testStr = "030173"

    def checkProxy(self):
        """Fetch ``self.testUrl`` through each proxy and record the working ones.

        A proxy passes when the request succeeds within ``self.timeout`` and
        the response contains ``self.testStr``; it is then appended to
        ``checkedProxyList`` as ``(ip, port, address, seconds)``.
        """
        cookies = urllib2.HTTPCookieProcessor()
        for proxy in self.proxyList:
            proxyHandler = urllib2.ProxyHandler(
                {"http": r'http://%s:%s' % (proxy[0], proxy[1])})
            opener = urllib2.build_opener(cookies, proxyHandler)
            opener.addheaders = [('User-agent', USER_AGENT)]
            t1 = time.time()
            try:
                with contextlib.closing(
                        opener.open(self.testUrl, timeout=self.timeout)) as resp:
                    result = resp.read()
            except Exception:
                # Dead or misbehaving proxies are expected; skip them quietly.
                continue
            timeused = time.time() - t1
            # Membership test: a marker at offset 0 or 1 still counts.
            if contains_marker(result, self.testStr):
                checkedProxyList.append((proxy[0], proxy[1], proxy[2], timeused))

    def run(self):
        self.checkProxy()


def db_insert(insert_list):
    """Replace the contents of the ``proxy`` table with ``insert_list``.

    ``insert_list`` holds ``(ip, port, speed, address)`` tuples. The table
    must already exist. MySQL errors are printed, not raised, and the
    connection is always closed.
    """
    if MySQLdb is None:
        print("MySQLdb is not installed; skipping database insert")
        return
    conn = None
    try:
        # NOTE(review): connection settings are hard-coded; adjust as needed.
        conn = MySQLdb.connect(host="localhost", user="root", passwd="admin",
                               db="m_common", charset='utf8')
        cursor = conn.cursor()
        try:
            cursor.execute('delete from proxy')
            cursor.execute('alter table proxy AUTO_INCREMENT=1')
            cursor.executemany(
                "INSERT INTO proxy (ip,port,speed,address) VALUES (%s,%s,%s,%s)",
                insert_list)
            conn.commit()
        finally:
            cursor.close()
    except MySQLdb.Error as e:
        if len(e.args) >= 2:
            print("Mysql Error %d: %s" % (e.args[0], e.args[1]))
        else:
            print("Mysql Error: %s" % (e,))
    finally:
        if conn is not None:
            conn.close()


def main():
    """Scrape, verify, and persist the proxies."""
    # One thread per listing page.
    getThreads = [ProxyGet(target) for target in targets]
    for worker in getThreads:
        worker.start()
    for worker in getThreads:
        worker.join()

    print('.' * 10 + "总共抓取了%s个代理" % len(rawProxyList) + '.' * 10)

    # Up to 20 verification threads, each checking one slice of the raw list.
    checkThreads = [ProxyCheck(chunk) for chunk in split_evenly(rawProxyList, 20)]
    for worker in checkThreads:
        worker.start()
    for worker in checkThreads:
        worker.join()

    print('.' * 10 + "总共有%s个代理通过校验" % len(checkedProxyList) + '.' * 10)

    # Persist the usable proxies, fastest first: text file, then database.
    usable = rank_usable_proxies(checkedProxyList)
    save_proxy_list(usable)
    db_insert([(ip, port, timeused, addr) for ip, port, addr, timeused in usable])


if __name__ == "__main__":
    main()
测试:
python proxy.py
结果如下:
['61.58.94.179', '8088', '\xe5\x8f\xb0\xe6\xb9\xbe\xe7\x9c\x81\xe5\x8f\xb0\xe6\xb9\xbe\xe5\xae\xbd\xe9\xa2\x91\xe9\x80\x9a\xe8\xae\xaf\xe9\xa1\xbe\xe9\x97\xae\xe8\x82\xa1\xe4\xbb\xbd\xe6\x9c\x89\xe9\x99\x90\xe5\x85\xac\xe5\x8f\xb8']
['200.84.116.99', '9064', '\xe5\xa7\x94\xe5\x86\x85\xe7\x91\x9e\xe6\x8b\x89']
['183.223.204.8', '8123', '\xe5\x9b\x9b\xe5\xb7\x9d\xe7\x9c\x81\xe8\x87\xaa\xe8\xb4\xa1\xe5\xb8\x82\xe7\xa7\xbb\xe5\x8a\xa8']
..........总共抓取了1921个代理..........
..........总共有524个代理通过校验..........

# more proxy_list.txt
202.106.169.142:80    北京市联通ADSL    0.291432857513
111.13.136.59:80    北京市移动    0.297957897186
111.13.136.56:80    北京市移动    0.373070955276
111.206.81.248:80    北京市联通    0.403017997742
111.13.136.58:80    北京市移动    0.414332151413
124.202.217.134:8118    北京市电信通    0.416817903519
124.202.183.218:8118    北京市电信通    0.426618099213
120.132.71.232:80    北京市联通    0.440200090408
61.232.6.164:8081    北京市铁通    0.469615936279
118.144.96.253:80    北京市电信通    0.485229969025
203.192.10.66:80    北京市新华社    0.51485991478
124.202.182.22:8118    北京市电信通    0.553130865097
数据库:
mysql> select * from m_common.proxy limit 10;
+----------+-----------------+------+----------+----------------------+---------------------+
| proxy_id | ip              | port | speed    | address              | create_time         |
+----------+-----------------+------+----------+----------------------+---------------------+
|        1 | 202.106.169.142 | 80   | 0.291433 | 北京市联通ADSL       | 2015-02-26 11:29:24 |
|        2 | 111.13.136.59   | 80   | 0.297958 | 北京市移动           | 2015-02-26 11:29:24 |
|        3 | 111.13.136.56   | 80   | 0.373071 | 北京市移动           | 2015-02-26 11:29:24 |
|        4 | 111.206.81.248  | 80   | 0.403018 | 北京市联通           | 2015-02-26 11:29:24 |
|        5 | 111.13.136.58   | 80   | 0.414332 | 北京市移动           | 2015-02-26 11:29:24 |
|        6 | 124.202.217.134 | 8118 | 0.416818 | 北京市电信通         | 2015-02-26 11:29:24 |
|        7 | 124.202.183.218 | 8118 | 0.426618 | 北京市电信通         | 2015-02-26 11:29:24 |
|        8 | 120.132.71.232  | 80   | 0.4402   | 北京市联通           | 2015-02-26 11:29:24 |
|        9 | 61.232.6.164    | 8081 | 0.469616 | 北京市铁通           | 2015-02-26 11:29:24 |
|       10 | 118.144.96.253  | 80   | 0.48523  | 北京市电信通         | 2015-02-26 11:29:24 |
+----------+-----------------+------+----------+----------------------+---------------------+
10 rows in set (0.00 sec)