python实现爬取图书封面
本文实例为大家分享了python实现爬取图书封面的具体代码,供大家参考,具体内容如下
kongfuzi.py
利用更换代理ip,延迟提交数据,设置请求头破解网站的反爬虫机制
# kongfuzi.py -- anti-anti-crawler download helper: rotates proxy IPs,
# randomizes the User-Agent header, and sleeps between retries.
import random
import time

import requests


class DownLoad():
    """HTTP downloader that retries failed requests through random proxies."""

    def __init__(self):
        # Candidate "host:port" HTTP proxies to rotate through.
        self.ip_list = ['191.33.179.242:8080', '122.72.108.53:80',
                        '93.190.142.214:80', '189.8.88.125:65301',
                        '36.66.55.181:8080', '170.84.102.5:8080',
                        '177.200.72.214:20183', '115.229.115.190:9000']
        # Bare User-Agent *values*.  BUGFIX: the original strings embedded
        # the literal prefix 'User-Agent:' inside the value, which sends a
        # malformed header ("User-Agent: User-Agent:Mozilla/...").
        self.user_agent_list = [
            'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) '
            'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) '
            'AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) '
            'AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
        ]

    def get(self, url, proxy=None, timeout=20, num=5):
        """GET *url* with retries.

        Tries a direct request first; once the direct retry budget is
        exhausted it switches to a random proxy from ``self.ip_list``.

        :param url: target URL.
        :param proxy: optional ``{'http': 'host:port'}`` mapping; ``None``
            means "try a direct connection first".
        :param timeout: per-request timeout in seconds.
        :param num: remaining retry budget.
        :return: the ``requests.Response`` on success, or ``None`` when
            every proxied retry also fails.
        """
        print("正在请求%s" % url)
        UA = random.choice(self.user_agent_list)
        headers = {'User-Agent': UA}
        if proxy is None:
            try:
                return requests.get(url, headers=headers, timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    # BUGFIX: keep the caller's timeout across retries
                    # (the original silently reset it to the default).
                    return self.get(url, timeout=timeout, num=num - 1)
                # Direct retries exhausted -- fall back to a random proxy.
                time.sleep(10)
                IP = random.choice(self.ip_list).strip()
                proxy = {'http': IP}
                return self.get(url, proxy=proxy, timeout=timeout)
        else:
            try:
                IP = random.choice(self.ip_list).strip()
                proxy = {'http': IP}
                # BUGFIX: requests takes the proxy mapping via the
                # ``proxies`` keyword.  The original passed ``proxy=``,
                # which raised TypeError on every call -- and the bare
                # ``except:`` swallowed it, so proxied requests never
                # actually went out.
                return requests.get(url, headers=headers, proxies=proxy,
                                    timeout=timeout)
            except requests.RequestException:
                if num > 0:
                    time.sleep(10)
                    IP = random.choice(self.ip_list).strip()
                    proxy = {'http': IP}
                    print("正在更换代理")
                    print("当前代理%s" % proxy)
                    return self.get(url, proxy=proxy, timeout=timeout,
                                    num=num - 1)
                return None  # retry budget exhausted
main.py
将爬取的图片保存到本地,然后展示到界面
# main.py -- download book-cover images for a kongfz.com search, save
# them under ./image, and display them in a tkinter window.
import os
from tkinter import *

import bs4
import requests
from PIL import Image, ImageTk

import kongfuzi


def download():
    """Search-button callback: build the result URL from the keyword entry
    and hand it to show()."""
    baseUrl = "http://search.kongfz.com"
    keyword = e1.get()
    url = baseUrl + "/product_result/?select=0&key=" + keyword
    print("下载链接:" + url)
    show(url)


def changesoup(html):
    """Decode a requests response body as UTF-8 and parse it with bs4."""
    htm = html.content
    html_doc = str(htm, 'utf-8')
    soup = bs4.BeautifulSoup(html_doc, "html.parser")
    return soup


def bookinfo(soup):
    """Scrape a result page.

    :param soup: parsed result page.
    :return: tuple of four parallel lists
        ``(bookname, price, place, storename)``.
    """
    # Book prices.
    # NOTE(review): the transcribed source lost its whitespace; this
    # selector may originally have been a descendant chain -- confirm
    # against the live page markup.
    price = [i.string for i in soup.select(".first-info.f_right.bold")]

    # Shop names.  BUGFIX: the original removed items from the list it was
    # iterating over, which skips the neighbour of every removed element;
    # filtering into a new list is safe.  (Selector reconstructed as
    # ".text a span" from the mangled ".textaspan" -- TODO confirm.)
    storename = [i.string for i in soup.select(".text a span")
                 if i.string is not None]

    # Seller locations.
    place = [i.string for i in soup.select(".user-place")]

    # Book titles.
    # NOTE(review): descendant separators reconstructed; verify the class
    # chain against the actual page if no titles match.
    bookname = []
    for each in soup.select(
            ".search-wrap .search-main .search-main-result .result-content "
            ".result-list .item .item-info .title .link"):
        bookname.append(each.get_text())
    return bookname, price, place, storename


def imgsave(soup):
    """Download every cover image on the page into ./image.

    :param soup: parsed result page.
    :return: list of local file paths, in page order.
    """
    dirName = "image"
    os.makedirs(dirName, exist_ok=True)
    filePathList = []
    # BUGFIX: the code below reads the ``src`` attribute, so the selector
    # must match <img> elements -- it has to end with the ``img`` tag
    # (the mangled source read "....item-img.img-boximg", which matches
    # nothing).  Descendant separators are reconstructed; TODO confirm.
    imgUrl = soup.select(
        ".search-main-result .result-content .result-list "
        ".item .item-img .img-box img")
    if not imgUrl:
        print("没有找到当前节点下图片")
    else:
        i = 0
        for imageUrls in imgUrl:
            downloadUrl = imageUrls.get('src')
            print("打印要下载的图片地址:", downloadUrl)
            # Keep only the last URL path component as the file name,
            # prefixed with the running index to avoid collisions.
            split = downloadUrl.split("/")
            fileName = str(i) + "-" + os.path.basename(split[-1])
            print("文件名:" + fileName)
            filePath = os.path.join(dirName, fileName)
            filePathList.append(filePath)
            if not os.path.exists(filePath):
                imageUrlPath = requests.get(downloadUrl)
                # Abort on a failed download instead of writing junk.
                imageUrlPath.raise_for_status()
                # 'wb': the payload is binary.  BUGFIX: ``with`` closes the
                # handle even if a chunk write raises.
                with open(filePath, 'wb') as imageFile:
                    for image in imageUrlPath.iter_content(10000):
                        imageFile.write(image)
            i = i + 1
    return filePathList


def show(url):
    """Fetch *url*, scrape info + covers, and display them in a new window."""
    xz = kongfuzi.DownLoad()
    html = xz.get(url)
    # Append the user-supplied proxy IP for subsequent requests.
    add_ip = e2.get()
    xz.ip_list.append(add_ip)
    soup = changesoup(html)
    bookname, price, place, storename = bookinfo(soup)
    filePathList = imgsave(soup)

    root1 = Toplevel()
    root1.geometry("1720x800")
    root1.title("孔网图片爬取")

    # Keep PhotoImage references alive in a list; tkinter only holds a
    # weak reference, so otherwise the labels would show blank images.
    photo = []
    for each in filePathList:
        img = Image.open(each)
        photo.append(ImageTk.PhotoImage(img))

    canvas = Canvas(root1, width=1700, height=800,
                    scrollregion=(0, 0, 0, 4000))  # scrollable area
    canvas.place(x=10, y=10)
    frame = Frame(canvas)  # content frame embedded in the canvas
    frame.place(width=1680, height=800)

    # BUGFIX: the original hard-coded range(50) and raised IndexError
    # whenever a search returned fewer than 50 results; clamp to the
    # shortest scraped list.
    count = min(50, len(photo), len(bookname), len(price),
                len(place), len(storename))
    for i in range(count):
        # 5 books per row; each book occupies 5 grid rows
        # (image + 4 info lines).
        rownum = int(i / 5)
        columnnum = i % 5
        imgLabel1 = Label(frame, image=photo[i], width=280, height=280)
        imgLabel1.grid(row=rownum * 5, column=columnnum, padx=10, pady=5)
        infoLabel1 = Label(frame, text="书名:" + bookname[i],
                           bg="#FFF8DC", justify=LEFT)
        infoLabel1.grid(row=rownum * 5 + 1, column=columnnum,
                        padx=45, pady=2, sticky=W)
        infoLabel2 = Label(frame, text="价格:" + price[i] + "元",
                           bg="#FFF8DC", justify=LEFT)
        infoLabel2.grid(row=rownum * 5 + 2, column=columnnum,
                        padx=45, pady=2, sticky=W)
        infoLabel3 = Label(frame, text="发货地区:" + place[i],
                           bg="#FFF8DC", justify=LEFT)
        infoLabel3.grid(row=rownum * 5 + 3, column=columnnum,
                        padx=45, pady=2, sticky=W)
        infoLabel4 = Label(frame, text="书店:" + storename[i],
                           bg="#FFF8DC", justify=LEFT)
        infoLabel4.grid(row=rownum * 5 + 4, column=columnnum,
                        padx=45, pady=2, sticky=W)

    vbar = Scrollbar(canvas, orient=VERTICAL)  # vertical scrollbar
    vbar.place(x=1680, width=20, height=800)
    vbar.configure(command=canvas.yview)
    canvas.config(yscrollcommand=vbar.set)
    canvas.create_window((800, 2000), window=frame)
    mainloop()


if __name__ == '__main__':
    # Search UI: keyword entry, proxy-IP entry, and a search button.
    root = Tk()
    root.title("孔网图片爬取")
    e1 = Entry(root)
    e2 = Entry(root)
    e1.grid(row=0, column=0, padx=20, pady=20)
    e2.grid(row=0, column=2, padx=20, pady=20)
    Label(root, text="关键字", width=10).grid(row=0, column=1,
                                           padx=10, pady=5)
    Label(root, text="添加代理ip", width=10).grid(row=0, column=3,
                                              padx=10, pady=5)
    Button(root, text="搜索", width=10,
           command=download).grid(row=1, column=1, padx=10, pady=5)
    mainloop()
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。