Python实现爬取千万淘宝商品的方法
本文实例讲述了Python实现爬取千万淘宝商品的方法。分享给大家供大家参考。具体实现方法如下:
import itertools
import json
import re
import sys
import time
from queue import Queue
from threading import Thread
from urllib.parse import quote_plus

import leveldb
import requests

# Search endpoint template: first placeholder is the URL-quoted query,
# second placeholder is the 1-based result page number.
URL_BASE = 'http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'

# Number of times a page download/parse is attempted before giving up.
MAX_FETCH_ATTEMPTS = 5


def url_get(url):
    """Download *url* and return the response body as text.

    The request is sent with a fixed set of browser-like headers and a
    5 second timeout. Any exception raised by ``requests.get`` propagates
    to the caller.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '1',
        'User-Agent': 'Mozilla/12.0(compatible;MSIE8.0;WindowsNT)',
    }
    return requests.get(url, timeout=5, headers=headers).text


def _fetch_json(url, attempts=MAX_FETCH_ATTEMPTS):
    """Return the decoded JSON body of *url*, retrying on failure.

    Up to *attempts* tries are made; the error from the final try is
    re-raised. A KeyboardInterrupt terminates the calling thread via
    ``sys.exit()``.
    """
    for attempt in range(attempts):
        try:
            return json.loads(url_get(url))
        except KeyboardInterrupt:
            sys.exit()
        except Exception:
            if attempt == attempts - 1:
                # Bare raise keeps the original traceback intact.
                raise


def _item_record(item, cate):
    """Build the dict stored in the item database for one search result.

    *item* is one element of the response's ``listItem`` array and *cate*
    is the query string that produced it. An empty ``category`` field is
    stored as 0.
    """
    raw_category = item['category']
    return dict(
        _id=int(item['itemNumId']),
        name=item['name'],
        price=float(item['price']),
        query=cate,
        category=int(raw_category) if raw_category != '' else 0,
        nick=item['nick'],
        area=item['area'],
    )


def _register_if_unknown(db_cate, name):
    """Store category *name* with state ``b'waiting'`` unless it already has one."""
    key = name.encode('utf-8')
    try:
        db_cate.Get(key)
    except KeyError:
        # py-leveldb signals a missing key with KeyError.
        db_cate.Put(key, b'waiting')


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: crawl every result page of each queued category.

    Categories are taken from *cate_queue*. A category whose state in
    *db_cate* is ``b'OK'`` is skipped; otherwise it is marked
    ``b'crawling'``, its pages are fetched until one comes back with an
    empty ``listItem``, every item is written to *db_item* keyed by its
    numeric id, and the category is finally marked ``b'OK'``. Category
    names found under ``nav.navCatList`` are registered as ``b'waiting'``
    if they are not in *db_cate* yet.

    Errors while processing a category are printed and the loop moves on
    to the next queue entry.
    """
    while True:
        try:
            cate = cate_queue.get()
            cate_key = cate.encode('utf-8')

            try:
                state = db_cate.Get(cate_key)
            except KeyError:
                state = None

            if state == b'OK':
                # The original formatted an undefined name here, raising
                # NameError; report the stored state instead.
                print('cate-{}:{}alreadyexists...Ignore'.format(
                    cate, bytes(state).decode('utf-8')))
                continue

            db_cate.Put(cate_key, b'crawling')

            for item_page in itertools.count(1):
                url = URL_BASE.format(quote_plus(cate), item_page)
                items_obj = _fetch_json(url)
                items = items_obj['listItem']
                if len(items) == 0:
                    # An empty page marks the end of the result list.
                    break

                for item in items:
                    item_obj = _item_record(item, cate)
                    db_item.Put(
                        str(item_obj['_id']).encode('utf-8'),
                        json.dumps(item_obj, ensure_ascii=False).encode('utf-8'),
                    )
                print('Get{}itemsfrom{}:{}'.format(len(items), cate, item_page))

                if 'nav' in items_obj:
                    for na in items_obj['nav']['navCatList']:
                        _register_if_unknown(db_cate, na['name'])

            db_cate.Put(cate_key, b'OK')
            print(cate, 'OK')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('An{}exceptionoccured'.format(e))


def cate_thread(cate_queue, db_cate):
    """Feeder loop: periodically enqueue every category not yet marked ``b'OK'``.

    Each pass iterates over all entries of *db_cate*, puts the decoded key
    of every entry whose value is not ``b'OK'`` into *cate_queue*, then
    sleeps for 10 seconds. Errors are printed and the loop continues.
    """
    while True:
        try:
            for key, value in db_cate.RangeIter():
                if value != b'OK':
                    name = key.decode('utf-8')
                    print('CateThread:put{}intoqueue'.format(name))
                    cate_queue.put(name)
            # Pause between full scans of the category database.
            time.sleep(10)
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('CateThread:{}'.format(e))


if __name__ == '__main__':
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    # Seed the crawl with one starting category if it is not recorded yet.
    orig_cate = '正装'
    _register_if_unknown(db_cate, orig_cate)

    cate_queue = Queue(maxsize=1000)

    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate))
    cate_th.start()

    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item))
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    cate_th.join()
希望本文所述对大家的Python程序设计有所帮助。