Python实现的爬虫功能代码
本文实例讲述了Python实现的爬虫功能。分享给大家供大家参考,具体如下:
主要用到re、urllib2、BeautifulSoup、MySQLdb等模块(代码为Python 2写法)
#encoding=utf-8
# Python 2 only: urllib2, reload(sys)/sys.setdefaultencoding do not exist in Python 3.
import re
import requests
import urllib2
import datetime
import MySQLdb
from bs4 import BeautifulSoup
import sys
reload(sys)
# Force the default codec so implicit str<->unicode conversions use UTF-8
# (a common Python 2 workaround; has process-wide effect).
sys.setdefaultencoding("utf-8")
class Splider(object):
    """Crawl fashion-trend articles from whowhatwear.com and save them to MySQL.

    Python 2 code: relies on module-level urllib2, re, datetime, MySQLdb and
    bs4.BeautifulSoup imports.
    """

    def __init__(self):
        print(u'开始爬取内容...')

    def getsource(self, url):
        """Return the raw response body of *url* as a byte string.

        Sends a browser-like User-Agent so the site does not reject the bot.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        socket = urllib2.urlopen(req)
        # Ensure the connection is released even if read() raises.
        try:
            content = socket.read()
        finally:
            socket.close()
        return content

    def changepage(self, url, total_page):
        """Generate the listing-page URLs from *url*'s page number up to *total_page*.

        *url* must contain a ``page/<N>`` segment; the returned list covers
        pages N..total_page inclusive.
        """
        now_page = int(re.search(r'page/(\d+)', url, re.S).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            # BUG FIX: the original passed re.S as re.sub()'s 4th positional
            # argument, which is *count*, not *flags*. No flag is needed here.
            link = re.sub(r'page/(\d+)', 'page/%d' % i, url)
            page_group.append(link)
        return page_group

    def getchildrencon(self, child_url):
        """Fetch one article page; return {'con': article text, 'img': 'url1;url2;...'}."""
        conobj = {}
        content = self.getsource(child_url)
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', {'class': 'c-article_content'})
        img = re.findall(r'src="(.*?)"', str(content), re.S)
        conobj['con'] = content.get_text()
        conobj['img'] = (';').join(img)
        return conobj

    def getcontent(self, html_doc):
        """Parse a listing page and return {index: {field: value}} per article.

        For every headline block, follows the article link and scrapes title,
        post date, body text, a 200-char summary, and embedded image URLs.
        """
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        tag = soup.find_all('div', {'class': 'promo-feed-headline'})
        info = {}
        i = 0
        for link in tag:
            info[i] = {}
            title_desc = link.find('h3')
            info[i]['title'] = title_desc.get_text()
            post_date = link.find('div', {'class': 'post-date'})
            # 'data-date' starts with an ISO timestamp; keep the YYYY-MM-DD part.
            pos_d = post_date['data-date'][0:10]
            info[i]['content_time'] = pos_d
            info[i]['source'] = 'whowhatwear'
            source_link = link.find('a', href=re.compile(r"section=fashion-trends"))
            source_url = 'http://www.whowhatwear.com' + source_link['href']
            info[i]['source_url'] = source_url
            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(in_content, 'html.parser', from_encoding='utf-8')
            soup_content = in_soup.find('section', {'class': 'widgets-list-content'})
            info[i]['content'] = soup_content.get_text().strip('\n')
            text_con = in_soup.find('section', {'class': 'text'})
            # BUG FIX: the original read "... if text_con.text != None else NULL";
            # NULL is an undefined name and text_con itself may be None when the
            # section is absent. Guard on text_con and fall back to ''.
            summary = text_con.get_text().strip('\n') if text_con is not None else ''
            info[i]['summary'] = summary[0:200] + '...'
            img_list = re.findall(r'src="(.*?)"', str(soup_content), re.S)
            info[i]['imgs'] = (';').join(img_list)
            # BUG FIX: format string had lost the space between date and time.
            info[i]['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            i += 1
        return info

    def saveinfo(self, content_info):
        """Insert every scraped article into t_fashion_spider2, then commit.

        *content_info* is a list of dicts as produced by getcontent().
        """
        conn = MySQLdb.Connect(host='127.0.0.1', user='root', passwd='123456', port=3306, db='test', charset='utf8')
        cursor = conn.cursor()
        # SECURITY FIX: the original interpolated scraped (untrusted) text into
        # the SQL string; use a parameterized query so the driver escapes values.
        sql = ("insert into t_fashion_spider2(`title`,`summary`,`content`,`content_time`,"
               "`imgs`,`source`,`source_url`,`create_time`) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s)")
        try:
            for each in content_info:
                for k, v in each.items():
                    cursor.execute(sql, (v['title'], v['summary'], v['content'],
                                         v['content_time'], v['imgs'], v['source'],
                                         v['source_url'], v['create_time']))
            conn.commit()
        finally:
            cursor.close()
            conn.close()
if __name__ == '__main__':
    # Crawl pages 1..5 of the fashion-trends section and persist the results.
    classinfo = []
    p_num = 5
    url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'
    jikesplider = Splider()
    all_links = jikesplider.changepage(url, p_num)
    for link in all_links:
        print(u'正在处理页面:' + link)
        html = jikesplider.getsource(link)
        info = jikesplider.getcontent(html)
        classinfo.append(info)
    jikesplider.saveinfo(classinfo)
更多关于Python相关内容可查看本站专题:《Python Socket编程技巧总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》
希望本文所述对大家Python程序设计有所帮助。