如何基于线程池提升requests模块效率
普通方法:爬取梨视频
importre
importtime
importrandom
importrequests
fromlxmlimportetree
# Wall-clock start; the script prints the elapsed time against this at the end
start_time = time.time()
# Listing page that is fetched first to discover the individual videos
url = "https://www.pearvideo.com/category_3"
# Browser-like User-Agent sent with every request. The spaces inside the
# string are significant: without them it is not a well-formed browser UA.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
}
# Regex applied to each detail page's source; group 1 is the video file URL
ex = 'srcUrl="(.*?)",vdoUrl=srcUrl'
def request_video(url, timeout=30):
    """Download a video and return its raw bytes.

    Args:
        url: Direct URL of the video file.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without a timeout a stalled connection would block the script forever
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the caller as "video" data
    response.raise_for_status()
    return response.content
def save_video(content):
    """Write video bytes to a new ``<3-digit number>.mp4`` file in the cwd.

    The file name is still a random number between 100 and 999, but a name
    that is already taken is never reused, so an earlier download cannot be
    silently overwritten.

    Args:
        content: Binary video data to store.

    Returns:
        The name of the file that was created.

    Raises:
        FileExistsError: If every name from ``100.mp4`` to ``999.mp4`` is
            already taken.
    """
    # Visit all 900 candidate numbers in random order so the loop always ends
    for number in random.sample(range(100, 1000), 900):
        video_name = str(number) + ".mp4"
        try:
            # 'x' = exclusive creation: fails if the file already exists
            f = open(video_name, 'xb')
        except FileExistsError:
            continue
        with f:
            f.write(content)
        return video_name
    raise FileExistsError("no free video file name left in 100.mp4-999.mp4")
# Fetch the listing page and parse it into an element tree
page_text = requests.get(url=url, headers=headers, timeout=30).text
tree = etree.HTML(page_text)
# @class is compared as a whole string, so the space between the two class
# names must be present for the <ul> to match
li_list = tree.xpath('//ul[@class="listvideo-list clearfix"]/li')
video_url_list = list()
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # List item without a detail link: nothing to download
        continue
    detail_url = "https://www.pearvideo.com/" + hrefs[0]
    # Fetch the source of this video's detail page
    detail_page_text = requests.get(url=detail_url, headers=headers, timeout=30).text
    # Pull the video file URL out of the page source with the regex
    matches = re.findall(ex, detail_page_text, re.S)
    if not matches:
        # Detail page does not contain a video URL in the expected form
        continue
    video_url = matches[0]
    video_url_list.append(video_url)
    # Sequential version: download and store each video before moving on
    content = request_video(video_url)
    save_video(content)
print("执行耗时:", time.time() - start_time)
执行耗时:147.22410440444946
使用线程池:爬取梨视频
# Crawl Pear Video using a thread pool
importre
importtime
importrandom
importrequests
fromlxmlimportetree
frommultiprocessing.dummyimportPool
# Wall-clock start; the script prints the elapsed time against this at the end
start_time = time.time()
# Listing page that is fetched first to discover the individual videos
url = "https://www.pearvideo.com/category_3"
# Browser-like User-Agent sent with every request. The spaces inside the
# string are significant: without them it is not a well-formed browser UA.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/79.0.3945.88 Safari/537.36"
    ),
}
# Regex applied to each detail page's source; group 1 is the video file URL
ex = 'srcUrl="(.*?)",vdoUrl=srcUrl'
def request_video(url, timeout=30):
    """Download a video and return its raw bytes.

    Called once per video URL by the thread pool, so it takes a single
    required argument.

    Args:
        url: Direct URL of the video file.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Without a timeout a stalled connection would block its worker forever
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of handing an error page to the caller as "video" data
    response.raise_for_status()
    return response.content
def save_video(content):
    """Write video bytes to a new ``<3-digit number>.mp4`` file in the cwd.

    The file name is still a random number between 100 and 999, but a name
    that is already taken is never reused. Exclusive creation makes the
    check-and-create step atomic, so two pool workers that draw the same
    number cannot overwrite each other's file.

    Args:
        content: Binary video data to store.

    Returns:
        The name of the file that was created.

    Raises:
        FileExistsError: If every name from ``100.mp4`` to ``999.mp4`` is
            already taken.
    """
    # Visit all 900 candidate numbers in random order so the loop always ends
    for number in random.sample(range(100, 1000), 900):
        video_name = str(number) + ".mp4"
        try:
            # 'x' = exclusive creation: fails if the file already exists
            f = open(video_name, 'xb')
        except FileExistsError:
            continue
        with f:
            f.write(content)
        return video_name
    raise FileExistsError("no free video file name left in 100.mp4-999.mp4")
# Fetch the listing page and parse it into an element tree
page_text = requests.get(url=url, headers=headers, timeout=30).text
tree = etree.HTML(page_text)
# @class is compared as a whole string, so the space between the two class
# names must be present for the <ul> to match
li_list = tree.xpath('//ul[@class="listvideo-list clearfix"]/li')
video_url_list = list()
for li in li_list:
    hrefs = li.xpath('./div/a/@href')
    if not hrefs:
        # List item without a detail link: nothing to download
        continue
    detail_url = "https://www.pearvideo.com/" + hrefs[0]
    # Fetch the source of this video's detail page
    detail_page_text = requests.get(url=detail_url, headers=headers, timeout=30).text
    # Pull the video file URL out of the page source with the regex
    matches = re.findall(ex, detail_page_text, re.S)
    if not matches:
        # Detail page does not contain a video URL in the expected form
        continue
    video_url_list.append(matches[0])
# Pool of 4 worker threads (multiprocessing.dummy is thread-backed); the
# with-block releases the workers once both phases are finished
with Pool(4) as pool:
    # Download the binary data of all videos concurrently
    content_list = pool.map(request_video, video_url_list)
    # Write the downloaded data to local files concurrently
    pool.map(save_video, content_list)
print("执行耗时:", time.time() - start_time)
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。