java实现登录之后抓取数据
最近做了一个从网络上抓取数据的一个小程序。主要关于信贷方面,收集的一些黑名单网站,从该网站上抓取到自己系统中。
也找了一些资料,觉得没有一个很好的,全面的例子。因此在这里做个笔记提醒自己。
首先需要一个jsoup的jar包,我用的是1.6.0版本。下载地址为:http://pan.baidu.com/s/1mgqOuHa
1,获取网页内容(核心代码,技术有限没封装)。
2,登录之后抓取网页数据(如何在请求中携带cookie)。
3,获取网站的ajax请求方法(返回json)。
以上这三点我就用一个类全部包含(比较糙望见谅,直接copy代码过去,应该就可以用)
一,这个类分别包含上面的1,2,3三种方法,直接用main方法可以进行测试
package com.minxinloan.black.web.utils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.Map;

import org.jsoup.Connection;
import org.jsoup.Connection.Method;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Demonstrates three scraping techniques against p2peye.com:
 * 1) fetching a page that requires a login (replaying session cookies),
 * 2) fetching a public page,
 * 3) issuing a raw HTTP GET (e.g. an ajax endpoint that returns JSON).
 *
 * NOTE(review): the original paste was truncated after the ajax read loop
 * (a dangling "List" token); the JSON-parsing tail could not be recovered,
 * so this version simply prints the raw ajax response body.
 */
public class CookieUtil {

    public static final String CONTENT_TYPE = "Content-Type";

    public static void main(String[] args) {
        String listURL = "http://www.p2peye.com/blacklist.php?p=2";
        String logURL = "http://www.p2peye.com/member.php";

        // ---------- 1. Page that requires a login ----------
        try {
            // POST the login form; the response carries the session cookies.
            Connection.Response res = Jsoup.connect(logURL)
                    .data("mod", "logging",
                          "action", "login",
                          "loginsubmit", "yes",
                          "loginhash", "Lsc66",
                          "username", "puqiuxiaomao",
                          "password", "a1234567")
                    .method(Method.POST)
                    .execute();

            // The cookie names depend on the target site's session configuration.
            Connection con = Jsoup.connect(listURL);
            // Present a desktop browser identity (original string had its spaces
            // stripped by the paste, which made the header malformed).
            con.header("User-Agent", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)");

            // Replay every login cookie on the follow-up request.
            Map<String, String> cookies = res.cookies();
            for (Map.Entry<String, String> en : cookies.entrySet()) {
                con = con.cookie(en.getKey(), en.getValue());
            }

            // Fetch the page as an authenticated user and walk its elements.
            Document objectDoc = con.get();
            Elements elements = objectDoc.getAllElements();
            for (Element element : elements) {
                for (Element child : element.getAllElements()) {
                    child.text();        // element text
                    child.attr("href");  // attribute, e.g. an <a> tag's link
                }
            }

            // ---------- 2. Page that needs no login ----------
            String url = "http://www.p2peye.com/blacklist.php?p=2";
            Document conTemp = Jsoup.connect(url).get();
            for (Element elementsTemp : conTemp.getAllElements()) {
                elementsTemp.text();
                elementsTemp.attr("href");
            }

            // ---------- 3. Raw HTTP GET (ajax endpoint returning JSON) ----------
            HttpURLConnection connection = null;
            try {
                StringBuilder sb = new StringBuilder();
                URL getUrl = new URL(url);
                connection = (HttpURLConnection) getUrl.openConnection();
                // try-with-resources: the original leaked this reader.
                try (BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        sb.append(line);
                    }
                }
                System.out.println(sb);
            } finally {
                if (connection != null) {
                    connection.disconnect();
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
二,这个是获取验证码的类,可以研究下。(但你需要分析出网站的验证码的请求地址)
packagecom.minxinloan.black.web.utils; importjava.io.BufferedReader; importjava.io.DataInputStream; importjava.io.DataOutputStream; importjava.io.File; importjava.io.FileOutputStream; importjava.io.FileWriter; importjava.io.InputStream; importjava.io.InputStreamReader; importjava.io.PrintWriter; importjava.net.HttpURLConnection; importjava.net.URL; importjava.net.URLConnection; importjava.nio.charset.Charset; importjava.util.HashMap; importjava.util.List; importjava.util.Map; importjava.util.StringTokenizer; publicclassUtils{//解析验证码的 publicstaticContentgetRandom(Stringmethod,StringsUrl,//要解析的url MapparamMap,//存放用户名和密码的map Map requestHeaderMap,//存放COOKIE的map booleanisOnlyReturnHeader,Stringpath){ Contentcontent=null; HttpURLConnectionhttpUrlConnection=null; InputStreamin=null; try{ URLurl=newURL(sUrl); booleanisPost="POST".equals(method); if(method==null ||(!"GET".equalsIgnoreCase(method)&&!"POST" .equalsIgnoreCase(method))){ method="POST"; } URLresolvedURL=url; URLConnectionurlConnection=resolvedURL.openConnection(); httpUrlConnection=(HttpURLConnection)urlConnection; httpUrlConnection.setRequestMethod(method); httpUrlConnection.setRequestProperty("Accept-Language", "zh-cn,zh;q=0.5"); //Donotfollowredirects,Wewillhandleredirectsourself httpUrlConnection.setInstanceFollowRedirects(false); httpUrlConnection.setDoOutput(true); httpUrlConnection.setDoInput(true); httpUrlConnection.setConnectTimeout(5000); httpUrlConnection.setReadTimeout(5000); httpUrlConnection.setUseCaches(false); httpUrlConnection.setDefaultUseCaches(false); httpUrlConnection.connect(); intresponseCode=httpUrlConnection.getResponseCode(); if(responseCode==HttpURLConnection.HTTP_OK ||responseCode==HttpURLConnection.HTTP_CREATED){ byte[]bytes=newbyte[0]; if(!isOnlyReturnHeader){ DataInputStreamins=newDataInputStream( httpUrlConnection.getInputStream()); //验证码的位置 DataOutputStreamout=newDataOutputStream( newFileOutputStream(path+"/code.bmp")); byte[]buffer=newbyte[4096]; intcount=0; 
while((count=ins.read(buffer))>0){ out.write(buffer,0,count); } out.close(); ins.close(); } Stringencoding=null; if(encoding==null){ encoding=getEncodingFromContentType(httpUrlConnection .getHeaderField("")); } content=newContent(sUrl,newString(bytes,encoding), httpUrlConnection.getHeaderFields()); } }catch(Exceptione){ returnnull; }finally{ if(httpUrlConnection!=null){ httpUrlConnection.disconnect(); } } returncontent; } publicstaticStringgetEncodingFromContentType(StringcontentType){ Stringencoding=null; if(contentType==null){ returnnull; } StringTokenizertok=newStringTokenizer(contentType,";"); if(tok.hasMoreTokens()){ tok.nextToken(); while(tok.hasMoreTokens()){ Stringassignment=tok.nextToken().trim(); inteqIdx=assignment.indexOf('='); if(eqIdx!=-1){ StringvarName=assignment.substring(0,eqIdx).trim(); if("charset".equalsIgnoreCase(varName)){ StringvarValue=assignment.substring(eqIdx+1) .trim(); if(varValue.startsWith("\"") &&varValue.endsWith("\"")){ //substringworksonindices varValue=varValue.substring(1, varValue.length()-1); } if(Charset.isSupported(varValue)){ encoding=varValue; } } } } } if(encoding==null){ return"UTF-8"; } returnencoding; } //这个是输出 publicstaticbooleaninFile(Stringcontent,Stringpath){ PrintWriterout=null; Filefile=newFile(path); try{ if(!file.exists()){ file.createNewFile(); } out=newPrintWriter(newFileWriter(file)); out.write(content); out.flush(); returntrue; }catch(Exceptione){ e.printStackTrace(); }finally{ out.close(); } returnfalse; } publicstaticStringgetHtmlReadLine(Stringhttpurl){ StringCurrentLine=""; StringTotalString=""; InputStreamurlStream; Stringcontent=""; try{ URLurl=newURL(httpurl); HttpURLConnectionconnection=(HttpURLConnection)url .openConnection(); connection.connect(); System.out.println(connection.getResponseCode()); urlStream=connection.getInputStream(); BufferedReaderreader=newBufferedReader( newInputStreamReader(urlStream,"utf-8")); while((CurrentLine=reader.readLine())!=null){ TotalString+=CurrentLine+"\n"; } 
content=TotalString; }catch(Exceptione){ } returncontent; } } classContent{ privateStringurl; privateStringbody; privateMap >m_mHeaders=newHashMap >(); publicContent(Stringurl,Stringbody,Map >headers){ this.url=url; this.body=body; this.m_mHeaders=headers; } publicStringgetUrl(){ returnurl; } publicStringgetBody(){ returnbody; } publicMap >getHeaders(){ returnm_mHeaders; } }