Python实现多线程抓取妹子图
心血来潮写了个多线程抓取妹子图的脚本,虽然代码还有一些瑕疵,但还是记录下来,分享给大家。
Pic_downloader.py
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 07 17:30:58 2015

@author: Dreace

Multi-threaded image scraper.

Scans the first ``MAX`` listing pages of ``sexy.faceks.com``, collects every
double-quoted token that contains ``.jpg`` and downloads the ones accepted by
``Filter`` into a fresh timestamped directory, using thread pools for both
steps.  The script runs unchanged on Python 2 and Python 3.
"""
import os
import random
import sys
import threading
import time
from multiprocessing.dummy import Pool as ThreadPool

try:
    import urllib2  # Python 2
except ImportError:
    import urllib.request as urllib2  # Python 3: same urlopen() call

type_ = sys.getfilesystemencoding()

MAX = 2          # number of listing pages to scan
count = 0        # images saved so far; only modified while holding _count_lock
time_out = 60    # timeout handed to every urlopen() call
thread_num = 30  # worker threads per pool
pic_list = []    # image URLs collected by get_pic()
page_list = []   # listing page URLs built by main()
# Image URLs containing any of these host names are skipped.
Filter_list = ["imgsize.ph.126.net", "img.ph.126.net", "img2.ph.126.net"]
_count_lock = threading.Lock()


def _console(text):
    """Return *text* ready for printing.

    On Python 2 a str literal holds the UTF-8 bytes of the source file, so it
    is transcoded to the file system encoding exactly as the original script
    did.  On Python 3 the text is returned untouched.
    """
    if isinstance(text, bytes):
        return text.decode("utf-8").encode(type_)
    return text


def rename():
    """Return the current local time formatted as ``YYYYmmddHHMMSS``."""
    return time.strftime("%Y%m%d%H%M%S")


def rename_2(name):
    """Return *name* left-padded with zeros to three characters plus ``.jpg``."""
    return name.rjust(3, '0') + '.jpg'


def download_pic(i):
    """Download the image at URL *i* into the current directory.

    URLs rejected by ``Filter`` are ignored.  Any failure is reported on
    stdout and the image is skipped, so one bad URL never stops the pool.
    """
    global count
    if not Filter(i):
        return
    try:
        content = urllib2.urlopen(i, timeout=time_out)
        try:
            url_content = content.read()
        finally:
            content.close()
        # The counter is shared by every worker thread: pick the file name,
        # write the file and bump the counter as one atomic step so that
        # ``count`` always equals the number of files actually saved.
        with _count_lock:
            file_name = (repr(random.randint(10000, 999999999)) + "_"
                         + rename_2(repr(count)))
            with open(file_name, "wb") as f:
                f.write(url_content)
            count += 1
    except Exception:
        print(i + _console("下载超时,跳过!"))


def Filter(content):
    """Return True when *content* contains none of the hosts in ``Filter_list``.

    The original returned True as soon as a single host was absent, which let
    almost every blacklisted URL through.
    """
    return not any(line.strip('\n') in content for line in Filter_list)


def get_pic(url_address):
    """Append every double-quoted token containing ``.jpg`` found at
    *url_address* to ``pic_list``.

    Any failure is reported on stdout and the page is skipped.
    """
    try:
        response = urllib2.urlopen(url_address, timeout=time_out)
        try:
            str_ = response.read()
        finally:
            response.close()
        if not isinstance(str_, str):
            # Python 3 returns bytes; only the (ASCII) URLs matter, so
            # undecodable bytes elsewhere in the page are simply replaced.
            str_ = str_.decode("utf-8", "replace")
        pic_list.extend(part for part in str_.split("\"") if ".jpg" in part)
    except Exception:
        print(_console("获取图片超时,跳过!"))


def main():
    """Create the output directory, collect the image URLs and download them."""
    dir_name = "C:\\Photos\\" + rename()
    os.makedirs(dir_name)
    os.chdir(dir_name)
    start_time = time.time()
    url_address = "http://sexy.faceks.com/?page="
    for i in range(1, MAX + 1):
        page_list.append(url_address + repr(i))

    page_pool = ThreadPool(thread_num)
    page_pool.map(get_pic, page_list)
    page_pool.close()  # the original never released this pool
    page_pool.join()
    print("%s %d %s" % (_console("获取到"), len(pic_list),
                        _console("张图片,开始下载!")))

    pool = ThreadPool(thread_num)
    pool.map(download_pic, pic_list)
    pool.close()
    pool.join()
    print("%d %s" % (count, _console("张图片保存在") + dir_name))
    print("%s %s s" % (_console("共耗时"), time.time() - start_time))


if __name__ == "__main__":
    main()
我们来看下一个网友的作品
#!/usr/bin/python
# coding: utf-8
#########################################################################
# File Name: main.py
# Author: mylonly
# mail: mylonly@gmail.com
# Created Time: Wed 11 Jun 2014 08:22:12 PM CST
#########################################################################
"""Crawl the /meinv/ wallpaper galleries of desk.zol.com.cn.

The index pages are scanned first to collect the gallery entry links.  A
producer thread (``getImageUrl``) then walks every gallery and queues the
image URLs, while a consumer thread (``getImage``) downloads them into
``localSavePath``.  The script runs unchanged on Python 2 and Python 3.
"""
import os
import re
import threading
import time

try:  # Python 2 module names
    import HTMLParser
    import Queue
    import urllib2
except ImportError:  # Python 3 equivalents bound to the same names
    import html.parser as HTMLParser
    import queue as Queue
    import urllib.request as urllib2

# Entry links of the individual galleries.
htmlDoorList = []
# Links of the HTML pages that contain images.
htmlUrlList = []
# Queue of image URLs waiting to be downloaded.
imageUrlList = Queue.Queue(0)
# Number of image URLs captured so far.
imageGetCount = 0
# Number of images downloaded so far.
imageDownloadCount = 0
# Next-page link reported by the gallery page parsed last; used to detect
# the end of a gallery.
nextHtmlUrl = ''
# Local directory the images are saved to.
localSavePath = '/data/1920x1080/'
# To download another resolution change replace_str.  Available resolutions:
# 1920x1200, 1980x1920, 1680x1050, 1600x900, 1440x900, 1366x768, 1280x1024,
# 1024x768, 1280x800
replace_str = '1920x1080'
replaced_str = '960x600'

host = 'http://desk.zol.com.cn'
indexUrl = '/meinv/'

# End-of-stream marker the producer queues once no more URLs will follow.
_DONE = object()
# File name of an image: optional digits followed by ".jpg".
_IMAGE_NAME_RE = re.compile(r'[0-9]*\.jpg')


def _say(*parts):
    """Print *parts* separated by single spaces (as ``print a, b`` did)."""
    print(' '.join(str(part) for part in parts))


def _fetch(url):
    """Return the body of *url* as text suitable for ``HTMLParser.feed``.

    Raises ``urllib2.URLError`` when the page cannot be retrieved.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        body = response.read()
    finally:
        response.close()
    if not isinstance(body, str):
        # Python 3 returns bytes.  The parsers only look at tag names and
        # attribute values, so undecodable page text is simply replaced.
        # NOTE(review): the real page encoding is not visible here - confirm.
        body = body.decode('utf-8', 'replace')
    return body


# Parser for the pages inside a gallery.
class ImageHtmlParser(HTMLParser.HTMLParser):
    """Queue the full-size image of a gallery page and record its next link."""

    def __init__(self):
        self.nextUrl = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Handle ``<img id="bigImg" ...>`` and ``<a id="pageNext" class="next" ...>``.

        The image URL is taken from the second attribute of the ``img`` tag
        and rewritten from ``replaced_str`` to ``replace_str``; the next-page
        link is taken from the third attribute of the ``a`` tag.
        """
        global imageGetCount, nextHtmlUrl
        if tag == 'img' and len(attrs) > 2:
            if attrs[0] == ('id', 'bigImg'):
                url = attrs[1][1].replace(replaced_str, replace_str)
                imageUrlList.put(url)
                imageGetCount = imageGetCount + 1
                _say(url)
        elif tag == 'a' and len(attrs) == 4:
            if attrs[0] == ('id', 'pageNext') and attrs[1] == ('class', 'next'):
                nextHtmlUrl = attrs[2][1]


# Parser for the index pages.
class IndexHtmlParser(HTMLParser.HTMLParser):
    """Collect gallery entry links and the next index page link.

    A gallery link is recognised in two steps: first a ``li`` tag carrying the
    value ``photo-list-padding``, then an ``a`` tag carrying the value ``pic``.
    """

    def __init__(self):
        self.urlList = []
        self.index = 0  # position in tagList/classList currently looked for
        self.nextUrl = ''
        self.tagList = ['li', 'a']
        self.classList = ['photo-list-padding', 'pic']
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Advance the two-step match or record the ``pageNext`` link.

        NOTE(review): both links are read from ``attrs[1][1]``, i.e. the tag's
        second attribute is assumed to be the href - confirm against the site.
        """
        if tag == self.tagList[self.index]:
            for attr in attrs:
                if attr[1] == self.classList[self.index]:
                    if self.index == 0:
                        # First level found.
                        self.index = 1
                    else:
                        # Second level found.
                        self.index = 0
                        _say(attrs[1][1])
                        self.urlList.append(attrs[1][1])
                    break
        elif tag == 'a':
            for attr in attrs:
                if attr[0] == 'id' and attr[1] == 'pageNext':
                    self.nextUrl = attrs[1][1]
                    _say('nextUrl:', self.nextUrl)
                    break


# Parser instance for the index pages.
indexParser = IndexHtmlParser()
# Parser instance for the gallery pages.
imageParser = ImageHtmlParser()


# Producer: turns the gallery entry links into image URLs.
class getImageUrl(threading.Thread):
    """Walk every gallery in ``htmlDoorList`` and queue its image URLs."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Follow each gallery page by page until its pages start repeating.

        The end-of-stream marker is always queued on exit, even after an
        unexpected error, so the downloader thread can terminate.
        """
        global nextHtmlUrl
        try:
            for door in htmlDoorList:
                _say('开始获取图片地址,入口地址为:', door)
                nextHtmlUrl = ''
                visited = set([door])
                current = door
                while current != '':
                    _say('开始从网页%s获取图片...' % (host + current))
                    try:
                        imageParser.feed(_fetch(host + current))
                    except urllib2.URLError as e:
                        _say(e.reason)
                        # The original retried the same page forever.
                        break
                    _say('下一个页面地址为:', nextHtmlUrl)
                    # Stop when the gallery wraps around to its entry page, or
                    # when the link did not change because the page had none.
                    if nextHtmlUrl in visited:
                        break
                    visited.add(nextHtmlUrl)
                    current = nextHtmlUrl
            _say('所有图片地址均已获得:', imageUrlList)
        finally:
            imageUrlList.put(_DONE)


# Consumer: downloads the queued image URLs.
class getImage(threading.Thread):
    """Download every queued image URL into ``localSavePath``."""

    def __init__(self):
        threading.Thread.__init__(self)

    def run(self):
        """Download until the end-of-stream marker is received.

        The original stopped as soon as the queue happened to be empty and
        could block forever when the last download failed.
        """
        global imageDownloadCount
        _say('开始下载图片...')
        while True:
            _say('目前捕获图片数量:', imageGetCount)
            _say('已下载图片数量:', imageDownloadCount)
            image = imageUrlList.get()
            if image is _DONE:
                break
            _say('下载文件路径:', image)
            try:
                response = urllib2.urlopen(image)
                try:
                    cont = response.read()
                finally:
                    response.close()
            except urllib2.URLError as e:
                _say(e.reason)
                continue
            match = _IMAGE_NAME_RE.search(image)
            if match:
                _say('正在下载文件:', match.group())
                filename = localSavePath + match.group()
                with open(filename, 'wb') as f:
                    f.write(cont)
                imageDownloadCount = imageDownloadCount + 1
            else:
                _say('nomatch')
        _say('文件全部下载完成...')


def main():
    """Scan the index pages, then start the producer and consumer threads."""
    global htmlDoorList
    # Collect all gallery entry links from the index pages.
    _say('开始扫描首页...')
    page = indexUrl
    while page != '':
        _say('正在抓取网页:', host + page)
        try:
            indexParser.feed(_fetch(host + page))
        except urllib2.URLError as e:
            _say(e.reason)
            # The original retried the same index page forever.
            break
        if page == indexParser.nextUrl:
            break
        page = indexParser.nextUrl
    _say('首页扫描完成,所有图集链接已获得:')
    htmlDoorList = indexParser.urlList

    # Make sure the target directory exists before any file is written.
    if not os.path.isdir(localSavePath):
        os.makedirs(localSavePath)

    get = getImageUrl()
    get.start()
    _say('获取图片链接线程启动:')
    # Head start kept from the original; shutdown correctness now relies on
    # the end-of-stream marker, not on this delay.
    time.sleep(2)
    download = getImage()
    download.start()
    _say('下载图片链接线程启动:')


if __name__ == '__main__':
    main()
批量抓取指定网页上的所有图片
# -*- coding: utf-8 -*-
"""Download every image referenced by a web page.

The page at ``url`` is fetched, all http/https links ending in a known image
extension are extracted with a regular expression and each one is saved
below ``outpath``.  The script runs unchanged on Python 2 and Python 3.
"""
import os
import re

try:  # Python 2
    from urllib import unquote, urlopen, urlretrieve
except ImportError:  # Python 3
    from urllib.parse import unquote
    from urllib.request import urlopen, urlretrieve

url = u"http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=python&oq=python&rsp=-1"
outpath = "t:\\"

# One alternative per scheme/extension pair, in the order of the original
# pattern.  The original listed "https ... .jpeg" twice and never
# "https ... .jpg", so https JPG links were silently dropped.
_IMAGE_EXTENSIONS = ('jpg', 'jpeg', 'png', 'gif', 'bmp')
_IMAGE_URL_RE = re.compile(
    '(' + '|'.join(
        r'%s://[^\s,"]*\.%s' % (scheme, extension)
        for scheme in ('http', 'https')
        for extension in _IMAGE_EXTENSIONS
    ) + ')'
)


def getHtml(url):
    """Fetch *url*, print the page and return it as text."""
    webfile = urlopen(url)
    try:
        outhtml = webfile.read()
    finally:
        webfile.close()
    if not isinstance(outhtml, str):
        # Python 3 returns bytes; the request asks for ie=utf-8.
        outhtml = outhtml.decode('utf-8', 'replace')
    print(outhtml)
    return outhtml


def getImageList(html):
    """Return, in document order, every image URL found in *html*.

    A URL starts with ``http://`` or ``https://``, contains no whitespace,
    comma or double quote and ends in .jpg, .jpeg, .png, .gif or .bmp.
    """
    imgList = _IMAGE_URL_RE.findall(html)
    print(imgList)
    return imgList


def download(imgList, page):
    """Save every URL of *imgList* as ``pic_<page>_<index><ext>`` in ``outpath``.

    *page* is only used to build the file name.  A download that fails is
    reported and skipped so that one broken link does not abort the batch.
    """
    for x, imgurl in enumerate(imgList, 1):
        # Only the extension of the last path segment is needed; the original
        # additionally decoded the name as UTF-8, which could raise on URLs
        # that are not valid UTF-8.
        extension = os.path.splitext(unquote(imgurl).split('/')[-1])[1]
        filepathname = (outpath + 'pic_%09d_%010d' % (page, x) + extension).lower()
        print('[Debug]Downloadfile:' + imgurl + '>>' + filepathname)
        try:
            urlretrieve(imgurl, filepathname)
        except (IOError, OSError) as error:
            print('[Debug]Skipped:' + imgurl + '>>' + str(error))


def downImageNum(pagenum):
    """Fetch the page and download its images *pagenum* times.

    NOTE(review): the same ``url`` is requested for every page, so values
    above 1 download the same images again under new names - confirm how the
    site expects the page number to be passed before changing this.
    """
    for page in range(1, pagenum + 1):
        html = getHtml(url)            # HTML the url points to
        imageList = getImageList(html)  # addresses of all images, as a list
        download(imageList, page)       # download every image


if __name__ == '__main__':
    downImageNum(1)
以上就是给大家汇总的3款Python实现的批量抓取妹子图片的代码了,希望对大家学习Python爬虫能够有所帮助。