Python实现爬取千万淘宝商品的方法
本文实例讲述了用Python爬取千万级淘宝商品数据的方法：脚本通过requests请求淘宝移动端搜索接口，用LevelDB保存类目状态和商品数据，并使用多线程并发抓取。分享给大家供大家参考。具体实现方法如下：
import itertools
import json
import re
import sys
import time
from queue import Queue
from threading import Thread
from urllib.parse import quote_plus

import leveldb
import requests
# Search API URL template. The first {} receives the URL-quoted query keyword
# and the second {} the result page number (pages are requested starting at 1).
# NOTE(review): n=200 is presumably the number of items per page -- confirm.
URL_BASE='http://s.m.taobao.com/search?q={}&n=200&m=api4h5&style=list&page={}'
def url_get(url, timeout=5):
    """Fetch *url* with browser-like request headers and return the body text.

    Args:
        url: Absolute URL to request.
        timeout: Timeout in seconds handed to ``requests.get``. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        Any exception raised by ``requests.get`` (for example on a timeout)
        propagates unchanged; the HTTP status code is not checked here.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
        'DNT': '1',
        # NOTE(review): this User-Agent value looks like it lost its spaces
        # when the script was pasted; it is kept byte-for-byte -- confirm.
        'User-Agent': 'Mozilla/12.0(compatible;MSIE8.0;WindowsNT)',
    }
    return requests.get(url, timeout=timeout, headers=headers).text
def _category_done(db_cate, cate):
    """Return True when *cate* is stored in *db_cate* with state ``b'OK'``."""
    try:
        return db_cate.Get(cate.encode('utf-8')) == b'OK'
    except KeyError:
        # A category with no record yet has never been crawled.
        return False


def _add_category_if_new(db_cate, name):
    """Record *name* as ``b'waiting'`` unless *db_cate* already has an entry."""
    key = name.encode('utf-8')
    try:
        db_cate.Get(key)
    except KeyError:
        db_cate.Put(key, b'waiting')


def _fetch_page(url, attempts=5):
    """Download *url* and parse it as JSON, retrying up to *attempts* times.

    The exception from the final failed attempt is re-raised. Only
    ``Exception`` subclasses are retried, so ``KeyboardInterrupt`` passes
    straight through to the caller.
    """
    for attempt in range(attempts):
        try:
            return json.loads(url_get(url))
        except Exception:
            if attempt == attempts - 1:
                raise


def _store_items(db_item, items, cate):
    """Write each search-result item to *db_item*, keyed by its numeric id."""
    for item in items:
        item_obj = dict(
            _id=int(item['itemNumId']),
            name=item['name'],
            price=float(item['price']),
            query=cate,
            # The API may send an empty string for a missing category.
            category=int(item['category']) if item['category'] != '' else 0,
            nick=item['nick'],
            area=item['area'])
        db_item.Put(str(item_obj['_id']).encode('utf-8'),
                    json.dumps(item_obj, ensure_ascii=False).encode('utf-8'))


def _crawl_category(cate, db_cate, db_item):
    """Crawl every result page of *cate* and mark it ``b'OK'`` when finished.

    The category is flagged ``b'crawling'`` first; if an exception aborts the
    crawl the flag stays, so the category is still not ``b'OK'`` afterwards.
    """
    cate_key = cate.encode('utf-8')
    db_cate.Put(cate_key, b'crawling')
    for item_page in itertools.count(1):
        items_obj = _fetch_page(URL_BASE.format(quote_plus(cate), item_page))
        items = items_obj['listItem']
        if not items:
            # An empty page means the previous page was the last one.
            break
        _store_items(db_item, items, cate)
        print('Get {} items from {}: {}'.format(len(items), cate, item_page))
        # NOTE(review): the pasted source lost its indentation; the related
        # category handling is applied to every non-empty page -- confirm.
        if 'nav' in items_obj:
            for na in items_obj['nav']['navCatList']:
                _add_category_if_new(db_cate, na['name'])
    db_cate.Put(cate_key, b'OK')


def item_thread(cate_queue, db_cate, db_item):
    """Worker loop: take category names from *cate_queue* and crawl them.

    Args:
        cate_queue: Queue-like object whose ``get()`` returns a category name
            (``str``).
        db_cate: Key-value store with ``Get``/``Put`` mapping the UTF-8 encoded
            category name to its state (``b'waiting'``, ``b'crawling'`` or
            ``b'OK'``).
        db_item: Key-value store with ``Put`` receiving one JSON document per
            item, keyed by the item id.

    Categories already marked ``b'OK'`` are skipped. Any other exception is
    printed and the loop carries on with the next category; the loop ends
    only on ``KeyboardInterrupt``.
    """
    while True:
        try:
            cate = cate_queue.get()
            if _category_done(db_cate, cate):
                # The old message formatted an undefined name ("title"), which
                # turned every skip into a reported NameError.
                print('cate-{} already exists ... Ignore'.format(cate))
                continue
            _crawl_category(cate, db_cate, db_item)
            print(cate, 'OK')
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('An {} exception occurred'.format(e))
def cate_thread(cate_queue, db_cate, interval=10):
    """Producer loop: repeatedly queue every category that is not yet ``b'OK'``.

    Args:
        cate_queue: Queue-like object; each pending category name is passed to
            its ``put()`` as ``str``.
        db_cate: Key-value store whose ``RangeIter()`` yields ``(key, value)``
            byte pairs, the key being the UTF-8 encoded category name.
        interval: Seconds to sleep between two scans. Defaults to 10, the
            value that used to be hard-coded.

    A failing scan is printed and retried after the same pause, so a
    persistent error cannot turn into a busy loop. The function returns only
    on ``KeyboardInterrupt``.
    """
    try:
        while True:
            try:
                for key, value in db_cate.RangeIter():
                    if value != b'OK':
                        name = key.decode('utf-8')
                        print('CateThread: put {} into queue'.format(name))
                        cate_queue.put(name)
            except Exception as e:
                print('CateThread: {}'.format(e))
            # Sleep after failed scans as well as successful ones.
            time.sleep(interval)
    except KeyboardInterrupt:
        pass
def main():
    """Open both LevelDB stores, seed the first category and run the crawler.

    One thread feeds pending categories into a bounded queue and five worker
    threads consume it. The call blocks until interrupted with Ctrl-C.
    """
    db_cate = leveldb.LevelDB('./taobao-cate')
    db_item = leveldb.LevelDB('./taobao-item')

    # Seed the crawl with one starting category unless it is already known.
    orig_cate = '正装'
    seed_key = orig_cate.encode('utf-8')
    try:
        db_cate.Get(seed_key)
    except KeyError:
        db_cate.Put(seed_key, b'waiting')

    cate_queue = Queue(maxsize=1000)
    # KeyboardInterrupt is only delivered to the main thread, so the threads
    # are daemonic: once the main thread leaves, the process can exit instead
    # of waiting on loops that never end by themselves.
    cate_th = Thread(target=cate_thread, args=(cate_queue, db_cate), daemon=True)
    cate_th.start()
    item_th = [
        Thread(target=item_thread, args=(cate_queue, db_cate, db_item), daemon=True)
        for _ in range(5)
    ]
    for item_t in item_th:
        item_t.start()

    try:
        cate_th.join()
    except KeyboardInterrupt:
        # Ctrl-C: fall through and let the interpreter shut down.
        pass


if __name__ == '__main__':
    main()
希望本文所述对大家的Python程序设计有所帮助。