A targeted Python crawler for campus forum posts
Introduction
I wrote this little crawler to scrape internship postings from our campus forum (cc98). It is built mainly on the Requests library.
Source code
URLs.py
Its job is to take an initial URL containing a page parameter and generate the list of URLs from the current page number up to pageNum.

import re

def getURLs(url, attr, pageNum=1):
    all_links = []
    try:
        # Read the current page number out of the URL, e.g. page=1 -> 1
        now_page_number = int(re.search(attr + r'=(\d+)', url, re.S).group(1))
        for i in range(now_page_number, pageNum + 1):
            # Swap the page number in the URL for each page we want
            new_url = re.sub(attr + r'=\d+', attr + '=%s' % i, url)
            all_links.append(new_url)
        return all_links
    except TypeError:
        print "arguments TypeError: attr should be string."
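For example, feeding it the board URL used later in this post with a target of 3 pages:

# Quick check of what getURLs produces for the cc98 list URL used below
urls = getURLs('http://www.cc98.org/list.asp?boardid=459&page=1&action=', 'page', 3)
for u in urls:
    print u
# http://www.cc98.org/list.asp?boardid=459&page=1&action=
# http://www.cc98.org/list.asp?boardid=459&page=2&action=
# http://www.cc98.org/list.asp?boardid=459&page=3&action=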
uni_2_native.py
The Chinese text in pages fetched from the forum comes back as numeric character references of the form &#XXX;, so after downloading a page the content still has to be converted back to native characters.

import sys
import re

reload(sys)
sys.setdefaultencoding('utf-8')

def get_native(raw):
    tostring = raw
    while True:
        # Find the next numeric character reference, e.g. &#20013;
        obj = re.search(r'&#(\d+);', tostring, flags=re.S)
        if obj is None:
            break
        else:
            raw, code = obj.group(0), obj.group(1)
            # Replace it with the character it encodes
            tostring = re.sub(raw, unichr(int(code)), tostring)
    return tostring
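A quick sanity check with a hand-made input (&#20013;&#25991; are the numeric references for 中 and 文):

# get_native decodes numeric character references in place
print get_native('hello &#20013;&#25991; world')
# -> hello 中文 world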
Saving the results to a MySQL database: saveInfo.py
(The class is named saveSqlite, but it actually writes to MySQL through MySQLdb.)

# -*- coding: utf-8 -*-
import MySQLdb

class saveSqlite():
    def __init__(self):
        self.infoList = []

    def saveSingle(self, author=None, title=None, date=None, url=None, reply=0, view=0):
        # Refuse to store incomplete records
        if author is None or title is None or date is None or url is None:
            print "No info saved!"
        else:
            singleDict = {}
            singleDict['author'] = author
            singleDict['title'] = title
            singleDict['date'] = date
            singleDict['url'] = url
            singleDict['reply'] = reply
            singleDict['view'] = view
            self.infoList.append(singleDict)

    def toMySQL(self):
        conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                               port=3306, db='db_name', charset='utf8')
        cursor = conn.cursor()
        # sql = "select * from info"
        # n = cursor.execute(sql)
        # for row in cursor.fetchall():
        #     for r in row:
        #         print r
        #     print '\n'
        # Clear the table, then bulk-insert the collected records
        sql = "delete from info"
        cursor.execute(sql)
        conn.commit()
        sql = "insert into info(title, author, url, date, reply, view) values(%s, %s, %s, %s, %s, %s)"
        params = []
        for each in self.infoList:
            params.append((each['title'], each['author'], each['url'],
                           each['date'], each['reply'], each['view']))
        cursor.executemany(sql, params)
        conn.commit()
        cursor.close()
        conn.close()

    def show(self):
        for each in self.infoList:
            print "author: " + each['author']
            print "title: " + each['title']
            print "date: " + each['date']
            print "url: " + each['url']
            print "reply: " + str(each['reply'])
            print "view: " + str(each['view'])
            print '\n'

if __name__ == '__main__':
    save = saveSqlite()
    save.saveSingle('网', 'aaa', '2008-10-10 10:10:10', 'www.baidu.com', 1, 1)
    # save.show()
    save.toMySQL()
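toMySQL assumes a table named info already exists in db_name; its schema is never shown here. A one-off setup sketch follows, with column types that are my assumption, derived only from the insert statement above:

# One-off setup sketch: create the info table the crawler writes to.
# Column names come from the insert statement above; the types are
# assumptions, not part of the original post.
import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', passwd='',
                       port=3306, db='db_name', charset='utf8')
cursor = conn.cursor()
cursor.execute("""
    create table if not exists info (
        `title`  varchar(255),
        `author` varchar(64),
        `url`    varchar(255),
        `date`   varchar(32),
        `reply`  int,
        `view`   int
    )
""")
conn.commit()
cursor.close()
conn.close()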
Main crawler code

import requests
from lxml import etree
from cc98 import uni_2_native, URLs, saveInfo

# Forge a header to match the site you want to crawl;
# fill in the values from your own browser session.
headers = {
    'Accept': '',
    'Accept-Encoding': '',
    'Accept-Language': '',
    'Connection': '',
    'Cookie': '',
    'Host': '',
    'Referer': '',
    'Upgrade-Insecure-Requests': '',
    'User-Agent': ''
}
url = 'http://www.cc98.org/list.asp?boardid=459&page=1&action='
cc98 = 'http://www.cc98.org/'

print "get information from cc98..."
urls = URLs.getURLs(url, "page", 50)
savetools = saveInfo.saveSqlite()

for url in urls:
    r = requests.get(url, headers=headers)
    html = uni_2_native.get_native(r.text)
    selector = etree.HTML(html)
    content_tr_list = selector.xpath('//form/table[@class="tableborder1 list-topic-table"]/tbody/tr')
    for each in content_tr_list:
        href = each.xpath('./td[2]/a/@href')
        if len(href) == 0:
            continue
        else:
            # print len(href)
            # Using a for loop here is clumsy since the list holds just
            # one element, but I don't know why I cannot get the data by index.
            for each_href in href:
                link = cc98 + each_href
            title_author_time = each.xpath('./td[2]/a/@title')
            # print len(title_author_time)
            for info in title_author_time:
                info_split = info.split('\n')
                title = info_split[0][1:len(info_split[0]) - 1]
                author = info_split[1][3:]
                date = info_split[2][3:]
            hot = each.xpath('./td[4]/text()')
            # print len(hot)
            for hot_num in hot:
                reply_view = hot_num.strip().split('/')
                reply, view = reply_view[0], reply_view[1]
            savetools.saveSingle(author=author, title=title, date=date,
                                 url=link, reply=reply, view=view)

print "All got! Now saving to database..."
# savetools.show()
savetools.toMySQL()
print "ALL CLEAR! Have fun!"
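To make the slicing in the @title parsing easier to follow, here is an offline sketch. The sample string's layout ([title] / 作者:... / 时间:...) is my guess at the forum tooltip format, not captured output:

# -*- coding: utf-8 -*-
# Offline sketch of the @title parsing above; the sample string's
# format is an assumption, not real cc98 output.
info = u'[暑期实习招聘]\n作者:somebody\n时间:2017-06-01 12:00:00'
info_split = info.split('\n')
title = info_split[0][1:len(info_split[0]) - 1]   # strip the surrounding [ ]
author = info_split[1][3:]                        # drop the leading u'作者:'
date = info_split[2][3:]                          # drop the leading u'时间:'
print title    # -> 暑期实习招聘
print author   # -> somebody
print date     # -> 2017-06-01 12:00:00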
That's all for this article; I hope it helps with your learning.