基于python实现的抓取腾讯视频所有电影的爬虫
我搜集了国内十几个电影网站的数据,里面有近几十万条记录,用文本文件没法存,而 MongoDB 的学习成本非常低,下载、安装、运行起来不会花你 5 分钟时间。
#-*-coding:utf-8-*- #byawakenjoys.mysite:www.dianying.at importre importurllib2 frombs4importBeautifulSoup importstring,time importpymongo NUM=0#全局变量,电影数量 m_type=u''#全局变量,电影类型 m_site=u'qq'#全局变量,电影网站 #根据指定的URL获取网页内容 defgethtml(url): req=urllib2.Request(url) response=urllib2.urlopen(req) html=response.read() returnhtml #从电影分类列表页面获取电影分类 defgettags(html): globalm_type soup=BeautifulSoup(html)#过滤出分类内容 #printsoup #<ulclass="clearfix_group"gname="mi_type"gtype="1"> tags_all=soup.find_all('ul',{'class':'clearfix_group','gname':'mi_type'}) #printlen(tags_all),tags_all #printstr(tags_all[1]).replace('\n','') #<a_hot="tag.sub"class="_gtag_hotkey"href="http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html"title="动作"tvalue="0">动作</a> re_tags=r'<a_hot=\"tag\.sub\"class=\"_gtag_hotkey\"href=\"(.+?)\"title=\"(.+?)\"tvalue=\"(.+?)\">.+?</a>' p=re.compile(re_tags,re.DOTALL) tags=p.findall(str(tags_all[0])) iftags: tags_url={} #printtags fortagintags: tag_url=tag[0].decode('utf-8') #printtag_url m_type=tag[1].decode('utf-8') tags_url[m_type]=tag_url else: print"NotFind" returntags_url #获取每个分类的页数 defget_pages(tag_url): tag_html=gethtml(tag_url) #divclass="paginator soup=BeautifulSoup(tag_html)#过滤出标记页面的html #printsoup #<divclass="mod_pagenav"id="pager"> div_page=soup.find_all('div',{'class':'mod_pagenav','id':'pager'}) #printdiv_page#len(div_page),div_page[0] #<aclass="c_txt6"href="http://v.qq.com/list/1_2_-1_-1_1_0_24_20_0_-1_0.html"title="25"><span>25</span></a> re_pages=r'<aclass=.+?><span>(.+?)</span></a>' p=re.compile(re_pages,re.DOTALL) pages=p.findall(str(div_page[0])) #printpages iflen(pages)>1: returnpages[-2] else: return1 defgetmovielist(html): soup=BeautifulSoup(html) #<ulclass="mod_list_pic_130"> divs=soup.find_all('ul',{'class':'mod_list_pic_130'}) #printdivs fordiv_htmlindivs: div_html=str(div_html).replace('\n','') #printdiv_html getmovie(div_html) defgetmovie(html): globalNUM globalm_type globalm_site 
#<h6class="caption"><ahref="http://www.tudou.com/albumcover/Z7eF_40EL4I.html"target="_blank"title="徒步旅行队">徒步旅行队</a></h6><ulclass="info"><liclass="desc">法国卖座喜剧片</li><liclass="cast"></li></ul></div><divclass="extext_last"><divclass="ext_txt"><h3class="ext_title">徒步旅行队</h3><divclass="ext_info"><spanclass="ext_area">地区:法国</span><spanclass="ext_cast">导演:</span><spanclass="ext_date">年代:2009</span><spanclass="ext_type">类型:喜剧</span></div><pclass="ext_intro">理查德·达奇拥有一家小的旅游公司,主要经营法国游客到非洲大草原的旅游服务。六个法国游客决定参加理查德·达奇组织的到非洲的一...</p> re_movie=r'<li><aclass=\"mod_poster_130\"href=\"(.+?)\"target=\"_blank\"title=\"(.+?)\"><img.+?</li>' p=re.compile(re_movie,re.DOTALL) movies=p.findall(html) ifmovies: conn=pymongo.Connection('localhost',27017) movie_db=conn.dianying playlinks=movie_db.playlinks #printmovies formovieinmovies: #printmovie NUM+=1 print"%s:%d"%("="*70,NUM) values=dict( movie_title=movie[1], movie_url=movie[0], movie_site=m_site, movie_type=m_type ) printvalues playlinks.insert(values) print"_"*70 NUM+=1 print"%s:%d"%("="*70,NUM) #else: #print"NotFind" defgetmovieinfo(url): html=gethtml(url) soup=BeautifulSoup(html) #packpack_albumalbum_cover divs=soup.find_all('div',{'class':'packpack_albumalbum_cover'}) #printdivs[0] #<ahref="http://www.tudou.com/albumplay/9NyofXc_lHI/32JqhiKJykI.html"target="new"title="《血滴子》独家纪录片"wl="1"></a> re_info=r'<ahref=\"(.+?)\"target=\"new\"title=\"(.+?)\"wl=\".+?\"></a>' p_info=re.compile(re_info,re.DOTALL) m_info=p_info.findall(str(divs[0])) ifm_info: returnm_info else: print"Notfindmovieinfo" returnm_info definsertdb(movieinfo): globalconn movie_db=conn.dianying_at movies=movie_db.movies movies.insert(movieinfo) if__name__=="__main__": globalconn tags_url="http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html" #printtags_url tags_html=gethtml(tags_url) #printtags_html tag_urls=gettags(tags_html) #printtag_urls forurlintag_urls.items(): printstr(url[1]).encode('utf-8')#,url[0] maxpage=int(get_pages(str(url[1]).encode('utf-8'))) printmaxpage 
forxinrange(0,maxpage): #http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html m_url=str(url[1]).replace('0_20_0_-1_0.html','') movie_url="%s%d_20_0_-1_0.html"%(m_url,x) printmovie_url movie_html=gethtml(movie_url.encode('utf-8')) #printmovie_html getmovielist(movie_html) time.sleep(0.1)