python制作最美应用的爬虫
安卓最美应用页面爬虫,爬虫很简单,涉及的东西倒挺多的
文件操作
正则表达式
字符串替换等等
import re

# Crawler for the "hot Android apps" community page of zuimeia.com.
# For every app linked from the hot list it extracts the title, the category
# trail, the plain-text description and the direct download URL, and appends
# one record per app to OUTPUT_PATH.  Written for Python 3.

# Site root; the hot list yields site-relative links that are joined onto it.
url = "http://zuimeia.com"
HOT_LIST_URL = 'http://zuimeia.com/community/app/hot/?platform=2'
OUTPUT_PATH = 'c:/wqa.txt'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever
RECORD_SEPARATOR = (
    '\n\n\n-----------------------------------------------------------------------------------------------------------------------------\n\n\n'
)

# NOTE: every pattern uses \s* where a tag name meets an attribute (and between
# attributes / class names).  The patterns this script started from had no
# whitespace at those positions and therefore could not match ordinary HTML;
# \s* accepts both the spaced and the unspaced form.
pattern = re.compile(
    r'<a\s*class="community-app-cover-wrapper"\s*href="(.*?)"\s*target="_blank">'
)
TITLE_RE = re.compile(r'"app-title"><h1>(.*?)</h1>')
CATEGORY_RE = re.compile(
    r'<a\s*class="app-tag"\s*href="/community/app/category/title/.*?/?platform=2">(.*?)</a>'
)
DESCRIBE_RE = re.compile(
    r'<div\s*id="article_content">(.*?)<div\s*class="community-image-wrapper">'
)
DOWNLOAD_RE = re.compile(
    r'<a\s*class="download-button\s*direct\s*hidden"\s*href="(.*?)"'
)

# Markup fragments that srtReplace turns into line breaks.
TAGS_TO_STRIP = (
    '<p>', '<br>', '<h1>', '<h2>', '<h3>', '<h4>', '<h5>', '<h6>', '<h7>',
    '<strong>', '</p>', '<br/>', '</h1>', '</h2>', '</h3>', '</h4>', '</h5>',
    '</h6>', '</h7>', '</strong>', '<b>', '</b>',
)


def _fetchHtml(pageUrl):
    """Download ``pageUrl`` and return the response body decoded to ``str``."""
    # Imported here, in the only function that touches the network, so the
    # pure string helpers below stay importable without the requests package.
    import requests

    response = requests.get(pageUrl, timeout=REQUEST_TIMEOUT)
    # .text (str), not .content (bytes): the str regexes above raise
    # TypeError when applied to bytes on Python 3.
    return response.text


def _firstOrEmpty(matches):
    """Return the first element of ``matches``, or '' when it is empty."""
    return matches[0] if matches else ''


def requestsUrl(url):
    """Fetch one app detail page and pull out its fields.

    Returns a 4-tuple ``(title, category, strdescribe, downloadUrl)`` where
    ``title``, ``category`` and ``downloadUrl`` are the (possibly empty) lists
    of regex matches and ``strdescribe`` is the description with its markup
    removed by ``srtReplace`` ('' when the page has no description block).
    """
    html = _fetchHtml(url)
    title = TITLE_RE.findall(html)
    category = CATEGORY_RE.findall(html)
    describe = DESCRIBE_RE.findall(html)
    # Guard against pages without a description instead of raising IndexError.
    strdescribe = srtReplace(describe[0]) if describe else ''
    downloadUrl = DOWNLOAD_RE.findall(html)
    return title, category, strdescribe, downloadUrl


def srtReplace(string):
    """Replace every tag in TAGS_TO_STRIP with a newline, then drop each
    pair of consecutive newlines, and return the result."""
    for tag in TAGS_TO_STRIP:
        string = string.replace(tag, '\n')
    return string.replace('\n\n', '')


def categornFinal(category):
    """Join the category names into one string, each followed by '-->'."""
    return ''.join(str(eachCategory) + '-->' for eachCategory in category)


def urlReplace(url):
    """Undo the HTML escaping of ampersands in a URL taken from an attribute."""
    # The previous replace('&', '&') was a no-op; the entity is the target.
    return url.replace('&amp;', '&')


def _formatRecord(title, category, strdescribe, downloadUrl):
    """Build the text block written to the output file for one app."""
    return (
        'title:' + title + '\n'
        + 'category:' + category + '\n'
        + 'strdescribe:' + strdescribe + '\n'
        + 'downloadUrl:' + downloadUrl
        + RECORD_SEPARATOR
    )


def main():
    """Crawl the hot list and append one record per app to OUTPUT_PATH."""
    urlList = pattern.findall(_fetchHtml(HOT_LIST_URL))
    with open(OUTPUT_PATH, 'a+', encoding='utf-8') as fd:
        for eachUrl in urlList:
            title, category, strdescribe, downloadUrl = requestsUrl(url + eachUrl)
            fd.write(_formatRecord(
                _firstOrEmpty(title),
                categornFinal(category),
                strdescribe,
                urlReplace(_firstOrEmpty(downloadUrl)),
            ))


if __name__ == "__main__":
    main()