Python大数据之从网页上爬取数据的方法详解
本文实例讲述了Python大数据之从网页上爬取数据的方法。分享给大家供大家参考,具体如下:
myspider.py :
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Spider that scrapes headline titles and links from the sohu.com front page."""
from scrapy.spiders import Spider
from lxml import etree

from jredu.items import JreduItem


class JreduSpider(Spider):
    # Unique spider name, required by Scrapy (run with "scrapy crawl tt").
    name = 'tt'
    allowed_domains = ['sohu.com']
    start_urls = [
        'http://www.sohu.com'
    ]

    def parse(self, response):
        """Parse the front page and yield one JreduItem per headline link.

        In each <ul> of the focus-news box the first <li> nests its title
        in <a><strong>; the remaining <li> elements put the title text
        directly in their last <a>.
        """
        content = response.body.decode('utf-8')
        dom = etree.HTML(content)
        for ul in dom.xpath("//div[@class='focus-news-box']/div[@class='list16']/ul"):
            # enumerate() replaces the original ul.index(li) lookup.
            for index, li in enumerate(ul.xpath("./li")):
                item = JreduItem()
                if index == 0:
                    strong = li.xpath("./a/strong/text()")
                    item['title'] = strong[0]
                    item['href'] = li.xpath("./a/@href")[0]
                else:
                    la = li.xpath("./a[last()]/text()")
                    item['title'] = la[0]
                    # BUGFIX: original used "./a[last()]/href", which selects
                    # a child *element* named "href" (always empty); the
                    # attribute axis "@href" is required to get the URL.
                    item['href'] = li.xpath("./a[last()]/@href")[0]
                yield item
items.py :
# -*- coding: utf-8 -*-
#
# Item model for the scraped data.
# See: http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class JreduItem(scrapy.Item):
    """Container for one scraped headline (comparable to a Java entity class)."""
    title = scrapy.Field()  # headline text
    href = scrapy.Field()   # headline URL
middlewares.py :
# -*- coding: utf-8 -*-
#
# Spider middleware for the jredu project (standard Scrapy template).
# See: http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class JreduSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # Scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # Used by Scrapy to create the middleware and hook up signals.
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider middleware
        # and into the spider. Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the spider after it has
        # processed the response. Must return an iterable of Request,
        # dict or Item objects.
        for entry in result:
            yield entry

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() (from another
        # spider middleware) raises an exception. Should return either
        # None or an iterable of Response, dict or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider; works like
        # process_spider_output() except there is no response associated.
        # Must return only requests (not items).
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        # BUGFIX: the pasted source had the garbled format string
        # 'Spideropened:%s'; restored to Scrapy's template message.
        spider.logger.info('Spider opened: %s' % spider.name)
pipelines.py :
# -*- coding: utf-8 -*-
#
# Item pipeline that appends every scraped item to data.txt as one JSON
# object per line (JSON Lines format).
#
# Don't forget to enable it via the ITEM_PIPELINES setting.
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
import json


class JreduPipeline(object):
    def __init__(self):
        # codecs.open handles UTF-8 encoding of the str we write.
        self.fill = codecs.open("data.txt", encoding="utf-8", mode="w")

    def process_item(self, item, spider):
        """Serialize one item as a JSON line and pass it on unchanged."""
        # ensure_ascii=False keeps Chinese titles human-readable instead
        # of \uXXXX escapes.
        line = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.fill.write(line)
        return item

    def close_spider(self, spider):
        # BUGFIX: the original never closed the file, so buffered lines
        # could be lost on exit; Scrapy calls close_spider() automatically.
        self.fill.close()
settings.py :
# -*- coding: utf-8 -*-
#
# Scrapy settings for the jredu project.
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'jredu'

SPIDER_MODULES = ['jredu.spiders']
NEWSPIDER_MODULE = 'jredu.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'jredu(+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
#     'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#     'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
#     'jredu.middlewares.JreduSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
#     'jredu.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
#     'scrapy.extensions.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'jredu.pipelines.JreduPipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
最后需要一个程序入口的方法:
main.py :
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Entry point that runs the 'tt' spider without invoking the scrapy CLI."""

# BUGFIX: the original executed "scrapycrawltt".split(), which yields the
# single token ['scrapycrawltt'] instead of ['scrapy', 'crawl', 'tt'], so
# the crawl command could never start.
COMMAND = "scrapy crawl tt"

if __name__ == "__main__":
    # Imported lazily so importing this module has no side effects.
    from scrapy import cmdline
    cmdline.execute(COMMAND.split())
更多关于Python相关内容可查看本站专题:《Python Socket编程技巧总结》、《Python正则表达式用法总结》、《Python数据结构与算法教程》、《Python函数使用技巧总结》、《Python字符串操作技巧汇总》、《Python入门与进阶经典教程》及《Python文件与目录操作技巧汇总》
希望本文所述对大家Python程序设计有所帮助。
声明:本文内容来源于网络,版权归原作者所有,内容由互联网用户自发贡献自行上传,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任。如果您发现有涉嫌版权的内容,欢迎发送邮件至:czq8825#qq.com(发邮件时,请将#更换为@)进行举报,并提供相关证据,一经查实,本站将立刻删除涉嫌侵权内容。