nodejs通过phantomjs实现下载网页
功能其实很简单:通过 phantomjs.exe 采集 url 加载的资源,再通过子进程的方式启动 nodejs 下载所有的资源;对于 css 资源,还会匹配 css 内容,下载其中 url 引用的资源。
当然功能还是很简单的,在响应式设计和异步加载的情况下,还是有很多资源没有能够下载,需要根据实际情况处理下
首先当然是下载nodejs和phantomjs
下面是phantomjs.exe执行的down.js
/*
 * down.js - run with PhantomJS: phantomjs down.js <some URL>
 *
 * Opens the page given on the command line, records the URL of every
 * resource the page loads, then spawns `node --harmony downHtml.js` with the
 * comma-joined URL list and relays the child's output to the console.
 */
var page = require('webpage').create(),
    system = require('system');
var spawn = require("child_process").spawn;

if (system.args.length === 1) {
    console.log('Usage: down.js <some URL>');
    phantom.exit(1);
} else {
    var urls = [];
    // URLs already recorded, so the same resource is not handed over twice.
    var seen = {};
    page.address = system.args[1];
    page.onResourceReceived = function (res) {
        // Record a resource only at its 'start' stage.
        // Only http(s) URLs are kept: data: URIs contain commas, which would
        // corrupt the comma-separated list passed to downHtml.js.
        if (res.stage === 'start' && /^https?:\/\//i.test(res.url) && !seen[res.url]) {
            seen[res.url] = true;
            urls.push(res.url);
        }
    };
    page.open(page.address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
        } else {
            console.log('down resource ' + urls.length + ' urls.');
            var child = spawn("node", ["--harmony", "downHtml.js", urls.join(',')]);
            child.stdout.on("data", function (data) {
                console.log(data);
            });
            child.stderr.on("data", function (data) {
                console.log(data);
            });
            child.on("exit", function (code) {
                // Propagate the downloader's exit status instead of always reporting success.
                phantom.exit(typeof code === 'number' ? code : 0);
            });
        }
    });
}
下面是对应的node运行的downHtml.js
"use strict";
// Node core modules used throughout downHtml.js.
const fs = require('fs');
const http = require('http');
const path = require('path');
// Named r_url so that function parameters called URL / url do not hide the module.
const r_url = require('url');
// Directories already known to exist (path -> 1); lets repeated calls skip the filesystem check.
const dirCache = {};

/**
 * Make sure the directory `pathStr` exists, creating missing parent
 * directories first, then invoke `callback` (with no arguments).
 *
 * The callback runs synchronously when the directory is already cached and
 * asynchronously otherwise. A failed mkdir is reported on stderr and the
 * callback is still invoked, so callers keep their best-effort behaviour.
 *
 * @param {string} pathStr - Directory path to create.
 * @param {Function} callback - Called once the attempt has finished.
 */
function makedir(pathStr, callback) {
  if (dirCache[pathStr] === 1) {
    callback();
    return;
  }
  fs.access(pathStr, function (accessErr) {
    if (!accessErr) {
      dirCache[pathStr] = 1;
      callback();
      return;
    }
    const createSelf = function () {
      fs.mkdir(pathStr, function (mkdirErr) {
        // EEXIST means a concurrent makedir call created it first; that is fine.
        if (!mkdirErr || mkdirErr.code === 'EEXIST') {
          dirCache[pathStr] = 1;
        } else {
          console.error('makedir: cannot create ' + pathStr + ': ' + mkdirErr.message);
        }
        callback();
      });
    };
    const parent = path.dirname(pathStr);
    if (parent === pathStr) {
      // Reached a root that does not exist; recursing further would never end.
      createSelf();
    } else {
      makedir(parent, createSelf);
    }
  });
}
// Matches every CSS url(...) token, quoted or unquoted, wherever it appears in a declaration.
const reg = /\burl\(\s*(['"]?).*?\1\s*\)/gi;
// Extracts the raw reference (capture group 2) from one token matched by `reg`.
const reg2 = /\(\s*(['"]?)(.*?)\1\s*\)/;
// Absolute URLs that have already been handled (url -> 1), so each is fetched at most once.
const isDownMap = {};

/**
 * Fetch the stylesheet at `URL`, find every url(...) reference in it and
 * download each referenced resource to CWD/<hostname>/<pathname>.
 *
 * References are resolved against `URL`. Anything that does not resolve to
 * an http: URL (data: URIs, https:, ...) is skipped, because http.get cannot
 * fetch it. Network and file errors are logged to stderr instead of crashing
 * the process.
 *
 * @param {string} URL - Absolute http URL of the CSS file.
 */
const downImgFromCss = function (URL) {
  http.get(URL, function (res) {
    let body = '';
    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      body += chunk;
    });
    res.on('end', function () {
      // String#match yields null (not an empty array) when nothing matches.
      const tokens = body.match(reg) || [];
      tokens.forEach(function (token) {
        const m = token.match(reg2);
        if (!m || !m[2]) {
          return;
        }
        const imgUrl = r_url.resolve(URL, m[2]);
        if (isDownMap[imgUrl]) {
          return;
        }
        isDownMap[imgUrl] = 1;
        const uo = r_url.parse(imgUrl);
        if (uo.protocol !== 'http:') {
          console.error('skip non-http resource: ' + imgUrl.slice(0, 80));
          return;
        }
        const filepath = CWD + '/' + uo.hostname + uo.pathname;
        makedir(path.dirname(filepath), function () {
          http.get(imgUrl, function (imgRes) {
            const out = fs.createWriteStream(filepath);
            out.on('error', function (err) {
              console.error('cannot write ' + filepath + ': ' + err.message);
            });
            imgRes.pipe(out);
          }).on('error', function (err) {
            console.error('cannot download ' + imgUrl + ': ' + err.message);
          });
        });
      });
    });
  }).on('error', function (err) {
    console.error('cannot download ' + URL + ': ' + err.message);
  });
};
// Comma-separated resource URLs passed as the first CLI argument (down.js supplies them).
const URLS = process.argv[2] ? process.argv[2].split(',').filter(Boolean) : [];
// Every download is stored below the directory the script was started from.
const CWD = process.cwd();

if (URLS.length === 0) {
  console.error('Usage: node --harmony downHtml.js url1,url2,...');
  process.exitCode = 1;
}

/**
 * Map a resource URL to the local file it is saved as:
 * CWD/<hostname>/<pathname>. A pathname that is empty or ends in "/" gets
 * "index.html" appended so the target is always a file, never a directory.
 *
 * @param {string} resourceUrl - Absolute URL of the resource.
 * @returns {string} Destination file path.
 */
function localPathFor(resourceUrl) {
  const uo = r_url.parse(resourceUrl);
  let pathname = uo.pathname || '/';
  if (pathname.charAt(pathname.length - 1) === '/') {
    pathname += 'index.html';
  }
  return CWD + '/' + uo.hostname + pathname;
}

// Download every resource; stylesheets are additionally scanned for the resources they reference.
URLS.forEach(function (resourceUrl) {
  if (r_url.parse(resourceUrl).protocol !== 'http:') {
    // http.get throws on any other protocol (https:, data:, ...).
    console.error('skip non-http resource: ' + resourceUrl.slice(0, 80));
    return;
  }
  const filepath = localPathFor(resourceUrl);
  makedir(path.dirname(filepath), function () {
    http.get(resourceUrl, function (res) {
      const contentType = res.headers['content-type'] || '';
      if (resourceUrl.indexOf('.css') !== -1 || contentType.indexOf('text/css') !== -1) {
        console.log('down images form css file:' + resourceUrl + '.');
        downImgFromCss(resourceUrl);
      }
      const out = fs.createWriteStream(filepath);
      out.on('error', function (err) {
        console.error('cannot write ' + filepath + ': ' + err.message);
      });
      res.pipe(out);
    }).on('error', function (err) {
      console.error('cannot download ' + resourceUrl + ': ' + err.message);
    });
  });
});
将 down.js 和 downHtml.js 放在同一个文件夹下,通过下列 cmd 命令运行:
D:\phantomjs-2.0.0-windows\bin\phantomjs.exe down.js http://www.youku.com/
以上所述就是本文的全部内容了,希望大家能够喜欢。