nodejs通过phantomjs实现下载网页
功能其实很简单:通过 phantomjs.exe 采集 url 加载的资源,再以子进程的方式启动 nodejs 下载所有资源;对于 css 资源,还会匹配 css 内容,下载其中 url 引用的资源
当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下
首先当然是下载nodejs和phantomjs
下面是phantomjs.exe执行的down.js
// down.js — run with PhantomJS: `phantomjs down.js <URL>`.
//
// Opens the page, records the URL of every resource the page loads, then
// hands the list to `node downHtml.js` (expected in the current working
// directory) as ONE comma-separated argument.
//
// NOTE: PhantomJS executes this file, so it is deliberately kept to ES5
// (`var`, function expressions).
// NOTE(review): a URL that itself contains ',' is split apart by downHtml.js,
// and a very long resource list can exceed the OS command-line length limit.
var page = require('webpage').create();
var system = require('system');
var spawn = require('child_process').spawn;

if (system.args.length === 1) {
  console.log('Usage: down.js <some URL>');
  phantom.exit(1);
} else {
  var urls = [];
  var seen = {}; // url -> true, so each resource is forwarded only once
  page.address = system.args[1];

  page.onResourceReceived = function (res) {
    // Record a resource when its 'start' stage is reported. Only http(s)
    // URLs are kept: data:/about:/file: entries cannot be downloaded by the
    // Node side.
    if (res.stage !== 'start') {
      return;
    }
    if (!/^https?:\/\//i.test(res.url) || seen[res.url]) {
      return;
    }
    seen[res.url] = true;
    urls.push(res.url);
  };

  page.open(page.address, function (status) {
    if (status !== 'success') {
      console.log('FAIL to load the address');
      phantom.exit(1);
      return;
    }

    console.log('down resource ' + urls.length + ' urls.');
    var child = spawn('node', ['--harmony', 'downHtml.js', urls.join(',')]);

    // Relay the downloader's output to the PhantomJS console.
    child.stdout.on('data', function (data) {
      console.log(data);
    });
    child.stderr.on('data', function (data) {
      console.log(data);
    });

    // Finish only after the downloader is done, and propagate its exit code.
    child.on('exit', function (code) {
      phantom.exit(code || 0);
    });
  });
}
下面是对应的node运行的downHtml.js
'use strict';

// downHtml.js — run as `node --harmony downHtml.js <url1,url2,...>`.
//
// Downloads every URL in the comma-separated list into
// <cwd>/<hostname>/<pathname>. Stylesheets are additionally scanned for
// url(...) references, which are downloaded the same way.

const fs = require('fs');
const http = require('http');
const https = require('https');
const path = require('path');
const r_url = require('url');

const CWD = process.cwd();

// Directories already known to exist (saves repeated mkdir calls).
const dirCache = Object.create(null);
// Local file paths already scheduled for download. Keyed by file path rather
// than URL so that "a.png?v=1" and "a.png?v=2" are not written concurrently
// to the same file.
const isDownMap = Object.create(null);

// Matches url(...) in CSS: optional quote in group 1, reference in group 2.
const CSS_URL = /url\(\s*(['"]?)(.*?)\1\s*\)/gi;

/**
 * Create directory `pathStr` and any missing parents.
 * @param {string} pathStr - directory to create
 * @param {function(Error=)} callback - called with no argument on success,
 *   or with the error when the directory could not be created
 */
function makedir(pathStr, callback) {
  if (dirCache[pathStr] === 1) {
    callback();
    return;
  }
  const done = function (err) {
    // EEXIST means the directory is already there, which is what we want.
    if (!err || err.code === 'EEXIST') {
      dirCache[pathStr] = 1;
      callback();
    } else {
      callback(err);
    }
  };
  fs.mkdir(pathStr, function (err) {
    const parent = path.dirname(pathStr);
    // ENOENT: a parent is missing. Create it, then retry exactly once.
    if (err && err.code === 'ENOENT' && parent !== pathStr) {
      makedir(parent, function (parentErr) {
        if (parentErr) {
          callback(parentErr);
          return;
        }
        fs.mkdir(pathStr, done);
      });
      return;
    }
    done(err);
  });
}

/**
 * Map a URL to the local file it is saved as.
 * @param {string} URL - absolute URL
 * @returns {?string} absolute file path under CWD, or null when the URL is
 *   not http(s), has no host, or would resolve outside CWD
 */
function localPathFor(URL) {
  const uo = r_url.parse(URL);
  if (!uo.hostname || !/^https?:$/i.test(uo.protocol || '')) {
    return null;
  }
  let pathname = uo.pathname || '/';
  // A path ending in '/' names a directory; store its content as index.html.
  if (pathname.charAt(pathname.length - 1) === '/') {
    pathname += 'index.html';
  }
  const filepath = path.resolve(CWD, uo.hostname + pathname);
  // Never write outside the download root, whatever the remote URL says.
  const rel = path.relative(CWD, filepath);
  if (rel === '' || rel === '..' || rel.indexOf('..' + path.sep) === 0 || path.isAbsolute(rel)) {
    return null;
  }
  return filepath;
}

/**
 * Collect the downloadable references of a stylesheet.
 * @param {string} cssText - stylesheet source
 * @param {string} baseUrl - URL of the stylesheet, used to resolve relative references
 * @returns {string[]} absolute URLs (data: URIs, empty and fragment-only references are skipped)
 */
function extractCssUrls(cssText, baseUrl) {
  const found = [];
  let m;
  CSS_URL.lastIndex = 0; // the regex is global and therefore stateful
  while ((m = CSS_URL.exec(cssText)) !== null) {
    // Drop the fragment ("font.eot?#iefix"): it is never sent to the server.
    const ref = m[2].split('#')[0].trim();
    if (ref === '' || /^data:/i.test(ref)) {
      continue;
    }
    found.push(r_url.resolve(baseUrl, ref));
  }
  return found;
}

/**
 * GET `URL` with the client matching its protocol.
 * Failures and non-2xx responses are logged and skipped.
 * @param {string} URL - absolute http(s) URL
 * @param {function(http.IncomingMessage)} onResponse - called for 2xx responses only
 */
function request(URL, onResponse) {
  const client = /^https:/i.test(URL) ? https : http;
  const onError = function (err) {
    console.error('fail ' + URL + ': ' + err.message);
  };
  try {
    client.get(URL, function (res) {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        console.error('skip ' + URL + ': HTTP ' + res.statusCode);
        res.resume(); // discard the body so the socket is released
        return;
      }
      onResponse(res);
    }).on('error', onError);
  } catch (err) {
    // get() may throw synchronously on a URL it cannot parse.
    onError(err);
  }
}

/**
 * Download `URL` to its local path; for stylesheets, also download every
 * resource they reference (recursively, each local file at most once).
 * @param {string} URL - absolute URL
 */
function download(URL) {
  const filepath = localPathFor(URL);
  if (filepath === null) {
    console.error('skip unsupported url: ' + URL);
    return;
  }
  if (isDownMap[filepath] === 1) {
    return;
  }
  isDownMap[filepath] = 1;

  makedir(path.dirname(filepath), function (dirErr) {
    if (dirErr) {
      console.error('fail ' + URL + ': ' + dirErr.message);
      return;
    }
    request(URL, function (res) {
      const out = fs.createWriteStream(filepath);
      out.on('error', function (writeErr) {
        console.error('fail ' + filepath + ': ' + writeErr.message);
      });
      res.pipe(out);

      const contentType = res.headers['content-type'] || '';
      if (URL.indexOf('.css') === -1 && contentType.indexOf('text/css') === -1) {
        return;
      }

      // Stylesheet: keep a copy of the bytes while they are being saved, so
      // the file does not have to be fetched a second time for scanning.
      console.log('down images from css file: ' + URL + '.');
      const chunks = [];
      res.on('data', function (chunk) {
        chunks.push(chunk);
      });
      res.on('end', function () {
        const cssText = Buffer.concat(chunks).toString('utf8');
        extractCssUrls(cssText, URL).forEach(function (assetUrl) {
          download(assetUrl);
        });
      });
    });
  });
}

// Entry point: argv[2] is the comma-separated URL list built by down.js.
if (!process.argv[2]) {
  console.error('Usage: node downHtml.js <url1,url2,...>');
  process.exitCode = 1;
} else {
  process.argv[2].split(',').forEach(function (URL) {
    if (URL !== '') {
      download(URL);
    }
  });
}
将 down.js 和 downHtml.js 放在同一个文件夹下,在该文件夹中通过下列 cmd 命令运行:
D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/
以上所述就是本文的全部内容了,希望大家能够喜欢。