在Python中使用CasperJS获取JS渲染生成的HTML内容的教程
文章摘要:其实这里 casperjs 与 python 没有直接关系,主要依赖 casperjs 调用 phantomjs 的 webkit 内核获取 html 文件内容。长期以来,爬虫抓取客户端 javascript 渲染生成的 html 页面都极为困难,Java 里面有 HtmlUnit,而 Python 里,我们可以使用独立的、跨平台的 CasperJS。
创建 site.js(接口文件,输入:url,输出:html 文件)
// USAGE: E:\toolkit\n1k0-casperjs-e3a77d0\bin> python casperjs site.js --url=http://spys.ru/free-proxy-list/IE/ --outputfile='temp.html'
//
// Interface script: takes a --url, renders it with the PhantomJS webkit
// engine, and writes the final (JS-rendered) HTML to --outputfile.

var fs = require('fs');

var casper = require('casper').create({
    pageSettings: {
        loadImages: false,   // skip images: we only need the HTML
        loadPlugins: false,
        userAgent: 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36 LBBROWSER'
    },
    logLevel: "debug",  // log level
    verbose: true       // echo log to the console
});

// cli.raw keeps the argument values unparsed (no type coercion).
var url = casper.cli.raw.get('url');
var outputfile = casper.cli.raw.get('outputfile');

// Request the page, then dump the rendered DOM as HTML.
casper.start(url, function () {
    fs.write(outputfile, this.getHTML(), 'w');
});

casper.run();
python 代码:checkout_proxy.py
importjson importsys #importrequests #importrequests.utils,pickle frombs4importBeautifulSoup importos.path,os importthreading #frommultiprocessingimportProcess,Manager fromdatetimeimportdatetime importtraceback importlogging importre,random importsubprocess importshutil importplatform output_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'proxy.txt') global_log='http_proxy'+datetime.now().strftime('%Y-%m-%d')+'.log' ifnotos.path.exists(os.path.join(os.path.dirname(os.path.realpath(__file__)),'logs')): os.mkdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),'logs')) global_log=os.path.join(os.path.dirname(os.path.realpath(__file__)),'logs',global_log) logging.basicConfig(level=logging.DEBUG,format='[%(asctime)s][%(levelname)s][%(module)s][%(funcName)s][%(lineno)d]%(message)s',filename=global_log,filemode='a') log=logging.getLogger(__name__) #manager=Manager() #PROXY_LIST=manager.list() mutex=threading.Lock() PROXY_LIST=[] defisWindows(): if"Windows"instr(platform.uname()): returnTrue else: returnFalse defgetTagsByAttrs(tagName,pageContent,attrName,attrRegValue): soup=BeautifulSoup(pageContent) returnsoup.find_all(tagName,{attrName:re.compile(attrRegValue)}) defgetTagsByAttrsExt(tagName,filename,attrName,attrRegValue): ifos.path.isfile(filename): f=open(filename,'r') soup=BeautifulSoup(f) f.close() returnsoup.find_all(tagName,{attrName:re.compile(attrRegValue)}) else: returnNone classSite1Thread(threading.Thread): def__init__(self,outputFilePath): threading.Thread.__init__(self) self.outputFilePath=outputFilePath self.fileName=str(random.randint(100,1000))+".html" self.setName('Site1Thread') defrun(self): site1_file=os.path.join(os.path.dirname(os.path.realpath(__file__)),'site.js') site2_file=os.path.join(self.outputFilePath,'site.js') ifnotos.path.isfile(site2_file)andos.path.isfile(site1_file): shutil.copy(site1_file,site2_file) 
#proc=subprocess.Popen(["bash","-c","cd%s&&./casperjssite.js--url=http://spys.ru/free-proxy-list/IE/--outputfile=%s"%(self.outputFilePath,self.fileName)],stdout=subprocess.PIPE) ifisWindows(): proc=subprocess.Popen(["cmd","/c","%s/casperjssite.js--url=http://spys.ru/free-proxy-list/IE/--outputfile=%s"%(self.outputFilePath,self.fileName)],stdout=subprocess.PIPE) else: proc=subprocess.Popen(["bash","-c","cd%s&&./casperjssite.js--url=http://spys.ru/free-proxy-list/IE/--outputfile=%s"%(self.outputFilePath,self.fileName)],stdout=subprocess.PIPE) out=proc.communicate()[0] htmlFileName='' #因为输出路径在windows不确定,所以这里加了所有可能的路径判断 ifos.path.isfile(self.fileName): htmlFileName=self.fileName elifos.path.isfile(os.path.join(self.outputFilePath,self.fileName)): htmlFileName=os.path.join(self.outputFilePath,self.fileName) elifos.path.isfile(os.path.join(os.path.dirname(os.path.realpath(__file__)),self.fileName)): htmlFileName=os.path.join(os.path.dirname(os.path.realpath(__file__)),self.fileName) if(notos.path.isfile(htmlFileName)): print'Failedtogethtmlcontentfromhttp://spys.ru/free-proxy-list/IE/' printout sys.exit(3) mutex.acquire() PROXYList=getTagsByAttrsExt('font',htmlFileName,'class','spy14$') forproxyinPROXYList: tdContent=proxy.renderContents() lineElems=re.split('[<>]',tdContent) ifre.compile(r'\d+').search(lineElems[-1])andre.compile('(\d+\.\d+\.\d+)').search(lineElems[0]): printlineElems[0],lineElems[-1] PROXY_LIST.append("%s:%s"%(lineElems[0],lineElems[-1])) mutex.release() try: ifos.path.isfile(htmlFileName): os.remove(htmlFileName) except: pass if__name__=='__main__': try: if(len(sys.argv))<2: print"Usage:%s[casperjspath]"%(sys.argv[0]) sys.exit(1) ifnotos.path.exists(sys.argv[1]): print"casperjspath:%sdoesnotexist!"%(sys.argv[1]) sys.exit(2) ifos.path.isfile(output_file): f=open(output_file) lines=f.readlines() f.close forlineinlines: PROXY_LIST.append(line.strip()) thread1=Site1Thread(sys.argv[1]) thread1.start() thread1.join() f=open(output_file,'w') 
forproxyinset(PROXY_LIST): f.write(proxy+"\n") f.close() print"Done!" exceptSystemExit: pass except: errMsg=traceback.format_exc() printerrMsg log.error(errMsg)