Python爬虫包BeautifulSoup学习实例(五)
本文为大家分享了Python爬虫包BeautifulSoup学习实例,具体内容如下
BeautifulSoup
使用BeautifulSoup抓取豆瓣电影的一些信息。
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date: 2016-12-24 16:18:01
# @Last Modified by: HaonanWu
# @Last Modified time: 2016-12-24 17:25:33
#
# Scrape the "now playing" movie list from Douban with BeautifulSoup.
import contextlib
import json

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen into urllib.request.
    import urllib.request as urllib2

from bs4 import BeautifulSoup


def nowplaying_movies(url):
    """Fetch *url* and return the movies it lists as now playing.

    Every ``<li class="list-item">`` whose ``data-category`` attribute is
    ``'nowplaying'`` produces one dict with the keys ``title``, ``score``,
    ``director`` and ``actors``, read from the matching ``data-*``
    attributes. A missing attribute yields ``None`` instead of raising
    ``KeyError``. Each movie is also printed as a single
    ``title|score|director|actors`` line.

    Args:
        url: Address of the page to download.

    Returns:
        A list of movie dicts in document order.
    """
    user_agent = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/54.0.2840.99 Safari/537.36'
    )
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)

    # closing() releases the connection even if parsing raises; the Python 2
    # response object is not a context manager by itself.
    with contextlib.closing(urllib2.urlopen(request)) as response:
        soup_packetpage = BeautifulSoup(response, 'lxml')

    items = soup_packetpage.find_all("li", class_="list-item")
    # Equivalent: soup_packetpage.find_all("li", {"class": "list-item"})

    movies = []
    for item in items:
        # .get() so that list items without the attribute are skipped
        # rather than aborting the whole scrape.
        if item.attrs.get('data-category') != 'nowplaying':
            continue
        movie = {
            'title': item.attrs.get('data-title'),
            'score': item.attrs.get('data-score'),
            'director': item.attrs.get('data-director'),
            'actors': item.attrs.get('data-actors'),
        }
        movies.append(movie)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % movie)
    return movies


if __name__ == '__main__':
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)
    print('%s' % json.dumps(movies, sort_keys=True, indent=4, separators=(',', ':')))
HTMLParser
使用HTMLParser实现上述功能
这里有一些HTMLParser的基础教程
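需要注意:网上常见的“HtmlParser自2006年以后就再没更新,目前很多人推荐使用jsoup代替它”的说法,针对的是Java的HTMLParser库(jsoup也是Java库),而不是这里使用的Python标准库HTMLParser模块(在Python 3中为html.parser)。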
# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date: 2016-12-24 15:57:54
# @Last Modified by: HaonanWu
# @Last Modified time: 2016-12-24 17:03:27
#
# Scrape the "now playing" movie list from Douban with the standard-library
# HTML parser.
import contextlib
import json

try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved Request/urlopen into urllib.request.
    import urllib.request as urllib2


def _first_attr(attrs, name):
    """Return the value of the first ``(key, value)`` pair named *name*.

    Returns ``None`` when no attribute with that name is present.
    """
    for key, value in attrs:
        if key == name:
            return value
    return None


class MovieParser(HTMLParser):
    """Collect now-playing movies from ``<li>`` start tags.

    After feeding HTML, ``movies`` holds one dict per ``<li>`` that has a
    non-empty ``data-title`` and ``data-category == 'nowplaying'``.
    """

    def __init__(self):
        # Explicit base call: the Python 2 HTMLParser is an old-style class,
        # so super() cannot be used there.
        HTMLParser.__init__(self)
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record a movie when *tag* is a matching ``<li>``.

        Args:
            tag: Lower-cased tag name supplied by the parser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag != 'li':
            return
        if not _first_attr(attrs, 'data-title'):
            return
        if _first_attr(attrs, 'data-category') != 'nowplaying':
            return

        movie = {
            'title': _first_attr(attrs, 'data-title'),
            'score': _first_attr(attrs, 'data-score'),
            'director': _first_attr(attrs, 'data-director'),
            'actors': _first_attr(attrs, 'data-actors'),
        }
        self.movies.append(movie)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % movie)


def nowplaying_movies(url):
    """Download *url* and return the now-playing movies found in it.

    Args:
        url: Address of the page to download.

    Returns:
        The list of movie dicts collected by ``MovieParser``.
    """
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/54.0.2840.99 Safari/537.36'
        )
    }
    req = urllib2.Request(url, headers=headers)

    # closing() guarantees the connection is released even when read() fails.
    with contextlib.closing(urllib2.urlopen(req)) as s:
        page = s.read()

    # On Python 3 read() returns bytes but feed() needs text. On Python 2 the
    # payload is already ``str`` and is passed through unchanged.
    # NOTE(review): assumes the page is UTF-8 encoded - confirm.
    if not isinstance(page, str):
        page = page.decode('utf-8', 'replace')

    parser = MovieParser()
    parser.feed(page)
    return parser.movies


if __name__ == '__main__':
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)
    print('%s' % json.dumps(movies, sort_keys=True, indent=4, separators=(',', ':')))
以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持毛票票。