python手机号前7位归属地爬虫代码实例
需求分析
项目上需要用到手机号前7位,用来判断号码是否合法,并查询归属地。旧的数据是几年前的,已经过时,打算用Python爬虫重新爬一份。
单线程版本
#coding:utf-8
importrequests
fromdatetimeimportdatetime
class PhoneInfoSpider:
    """Crawl phone-number segment info from the Taobao ``mobile_tel_segment`` endpoint.

    For every prefix in ``phoneSections`` a series of 11-digit numbers is
    generated (prefix + 4-digit middle + ``"0000"``), each one is looked up
    over HTTP, and one comma-separated line per successful lookup is appended
    to ``./result.txt``.
    """

    def __init__(self, phoneSections):
        """Store the prefixes to crawl.

        Args:
            phoneSections: iterable of prefix strings (e.g. ``'133'``).
        """
        self.phoneSections = phoneSections

    def phoneInfoHandler(self, textData):
        """Parse one endpoint response and append the result to ``./result.txt``.

        The response is split into lines; responses with fewer than 9 lines
        are ignored. The single-quoted value on lines 1, 2, 3 and 5 (0-based)
        is extracted and written as ``number,province,mobile_area,postcode``.

        Args:
            textData: raw response body as text.

        Raises:
            IndexError: if one of the inspected lines holds no quoted value.
        """
        text = textData.splitlines(True)
        if len(text) >= 9:
            # NOTE(review): the field names below are the original author's;
            # which response key sits on which line should be confirmed
            # against a live response.
            number = text[1].split('\'')[1]
            province = text[2].split('\'')[1]
            mobile_area = text[3].split('\'')[1]
            postcode = text[5].split('\'')[1]
            line_text = number + "," + province + "," + mobile_area + "," + postcode
            print(line_text)
            try:
                # The context manager closes the handle after every append,
                # so a long crawl does not accumulate open files.
                with open('./result.txt', 'a') as f:
                    f.write(str(line_text) + '\n')
            except Exception as e:
                # Best effort: a failed write must not abort the crawl.
                print(Exception, ":", e)

    def requestPhoneInfo(self, phoneNum):
        """Look up one phone number and hand the response to ``phoneInfoHandler``.

        Any error (network failure, timeout, malformed response) is printed
        and swallowed so the crawl continues with the next number.

        Args:
            phoneNum: the full phone number as a string.
        """
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Without a timeout a stalled connection would block the crawl
            # forever; 10 seconds is a generous safety bound.
            response = requests.get(url, timeout=10)
            self.phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)

    def requestAllSections(self, last=0, limit=10):
        """Query every generated number for every configured prefix.

        Args:
            last: first middle index to query for the FIRST prefix only; use
                it to resume after an abnormal exit. Later prefixes always
                start from 0.
            limit: exclusive upper bound of the middle index. The default of
                10 keeps the original behaviour; pass 10000 to cover every
                4-digit middle part.
        """
        # Numbers are generated as prefix + zero-padded middle + "0000".
        for head in self.phoneSections:
            head_begin = datetime.now()
            print(head + "begintime:" + str(head_begin))
            for i in range(last, limit):
                middle = str(i).zfill(4)
                phoneNum = head + middle + "0000"
                self.requestPhoneInfo(phoneNum)
            # The resume offset applies to the first prefix only.
            last = 0
            head_end = datetime.now()
            print(head + "endtime:" + str(head_end))
if __name__ == '__main__':
    # Entry point of the single-threaded crawler: build the prefix list,
    # crawl every prefix sequentially and print start/end timestamps.
    task_begin = datetime.now()
    print("phonecheckbegintime:" + str(task_begin))
    # Prefixes grouped as: China Telecom, China Unicom, China Mobile,
    # virtual operators.
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice and
    # duplicated its rows in the result file.
    lt = ['130', '131', '132', '145', '146', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '148', '150', '151', '152', '157', '158', '159', '172',
          '178', '182', '183', '184', '187', '188', '198']
    add = ['170']
    all_num = dx + lt + yd + add
    print(len(all_num))
    # Prefixes to crawl.
    spider = PhoneInfoSpider(all_num)
    spider.requestAllSections()
    task_end = datetime.now()
    print("phonecheckendtime:" + str(task_end))
发现爬取一个号段共需10000次查询,单线程版大概要1个半小时,太慢了。
多线程版本
#coding:utf-8
importrequests
fromdatetimeimportdatetime
importqueue
importthreading
# Number of worker threads started for each phone-number prefix.
threadNum=32
class MyThread(threading.Thread):
    """Thread that executes a caller-supplied zero-argument callable."""

    def __init__(self, func):
        """Remember ``func``; it is invoked when the thread runs."""
        super().__init__()
        self.func = func

    def run(self):
        """Thread body: call the stored callable once."""
        task = self.func
        task()
def requestPhoneInfo():
    """Worker loop: take tasks from the global queue ``q`` until it is empty.

    Reads the module globals ``q`` (task queue), ``lock`` (guards the
    check-and-take on the queue) and ``phone_head`` (prefix being crawled).
    Each task is turned into a phone number, looked up over HTTP and passed
    to ``phoneInfoHandler``. Request and parsing errors are printed and
    swallowed so one bad lookup does not kill the worker.
    """
    while True:
        # The emptiness check and the get() must happen under one lock,
        # otherwise a worker could block forever on a queue that another
        # worker has just drained.
        with lock:
            if q.qsize() == 0:
                break
            print("queuesize:" + str(q.qsize()))
            p = q.get()  # task number; the main script enqueues 1..10000
        # Derive the middle part from the dequeued task itself. The previous
        # code used 9999 - q.qsize() AFTER releasing the lock, so concurrent
        # workers could compute the same value (duplicate lookups) and skip
        # others. In FIFO order both formulas give p - 1.
        middle = str(p - 1).zfill(4)
        phoneNum = phone_head + middle + "0000"
        print("phoneNum:" + phoneNum)
        try:
            url = 'https://tcc.taobao.com/cc/json/mobile_tel_segment.htm?tel=' + phoneNum
            # Bound the wait so a stalled connection cannot hang the worker
            # (and with it the join() in the main script) forever.
            response = requests.get(url, timeout=10)
            phoneInfoHandler(response.text)
        except Exception as e:
            print(Exception, ":", e)
def phoneInfoHandler(textData):
    """Parse one endpoint response and append the result to ``./result.txt``.

    The response is split into lines; responses with fewer than 9 lines are
    ignored. The single-quoted value on lines 1, 2, 3 and 5 (0-based) is
    extracted and written as ``number,province,mobile_area,postcode``.

    Args:
        textData: raw response body as text.

    Raises:
        IndexError: if one of the inspected lines holds no quoted value.
    """
    text = textData.splitlines(True)
    if len(text) >= 9:
        # NOTE(review): the field names below are the original author's;
        # which response key sits on which line should be confirmed against
        # a live response.
        number = text[1].split('\'')[1]
        province = text[2].split('\'')[1]
        mobile_area = text[3].split('\'')[1]
        postcode = text[5].split('\'')[1]
        line_text = number + "," + province + "," + mobile_area + "," + postcode
        print(line_text)
        try:
            # The context manager closes the handle after every append, so
            # a long crawl does not accumulate open files.
            with open('./result.txt', 'a') as f:
                f.write(str(line_text) + '\n')
        except Exception as e:
            # Best effort: a failed write must not abort the crawl.
            print(Exception, ":", e)
if __name__ == '__main__':
    # Entry point of the multi-threaded crawler: for every prefix, fill a
    # queue with 10000 tasks and let threadNum workers drain it.
    task_begin = datetime.now()
    print("phonecheckbegintime:" + str(task_begin))
    dx = ['133', '149', '153', '173', '177', '180', '181', '189', '199']
    # '166' used to be listed twice, which crawled that prefix twice and
    # duplicated its rows in the result file.
    lt = ['130', '131', '132', '145', '155', '156', '166', '171', '175', '176', '185', '186']
    yd = ['134', '135', '136', '137', '138', '139', '147', '150', '151', '152', '157', '158', '159', '172', '178',
          '182', '183', '184', '187', '188', '198']
    all_num = dx + lt + yd
    print(len(all_num))
    for head in all_num:
        head_begin = datetime.now()
        print(head + "begintime:" + str(head_begin))
        # q, lock and phone_head are module-level globals read by
        # requestPhoneInfo; they must be set before any worker starts.
        q = queue.Queue()
        lock = threading.Lock()
        phone_head = head
        for p in range(10000):
            q.put(p + 1)
        print(q.qsize())
        threads = []
        for _ in range(threadNum):
            thread = MyThread(requestPhoneInfo)
            thread.start()
            threads.append(thread)
        # Finish the current prefix before moving on, because the workers
        # share the globals that the next iteration rebinds.
        for thread in threads:
            thread.join()
        head_end = datetime.now()
        print(head + "endtime:" + str(head_end))
    task_end = datetime.now()
    print("phonecheckendtime:" + str(task_end))
多线程版爬取1个号码段(10000次查询),大概2~3分钟就好,CPU使用率飙升,大概维持在70%左右。
总共40多个号段,爬完大概1~2个小时,总数据约41万条。
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。