Java抓取网页爬虫

xiaoxiao 2021-02-28 87

import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import org.apache.commons.httpclient.Header; import org.apache.commons.httpclient.HttpClient; import org.apache.commons.httpclient.HttpException; import org.apache.commons.httpclient.HttpStatus; import org.apache.commons.httpclient.NameValuePair; import org.apache.commons.httpclient.methods.PostMethod; public class RetrivePage { private static HttpClient httpClient=new HttpClient(); //设置代理服务器 static{ //设置代理服务器的IP地址和端口 //httpClient.getHostConfiguration().setProxy("192.168.28.137", 8080); } public static boolean downloadPage(String path) throws HttpException, IOException{ InputStream input=null; OutputStream output=null; //得到post方法 PostMethod postMethod=new PostMethod(path); //设置post方法的参数 /*NameValuePair[] postData=new NameValuePair[2]; postData[0]=new NameValuePair("name","lietu"); postData[1]=new NameValuePair("password","*****"); postMethod.addParameters(postData);*/ //执行，返回状态码 int statusCode=httpClient.executeMethod(postMethod); //针对状态码进行处理（简单期间，只处理返回值为200的状态码） if(statusCode==HttpStatus.SC_OK){ input=postMethod.getResponseBodyAsStream(); //得到文件名 //String filename="G://pachon//"+path.substring(path.lastIndexOf('/')+1); String filename="G:\\pachon\\"+"hello.html"; System.out.println("filtname="+filename); //获得文件输出流 output=new FileOutputStream(filename); //输出到文件 int tempBytes=-1; while((tempBytes=input.read())>0){ output.write(tempBytes); } //关闭输入输出流 if(input!=null){ input.close(); } if(output!=null){ output.close(); } return true; } //若需要转向，则进行转向操作 if((statusCode==HttpStatus.SC_MOVED_TEMPORARILY)|| (statusCode==HttpStatus.SC_MOVED_PERMANENTLY)|| (statusCode==HttpStatus.SC_TEMPORARY_REDIRECT)|| statusCode==HttpStatus.SC_SEE_OTHER){ //获取新的URL地址 Header header=postMethod.getResponseHeader("location"); if(header==null){ String newUrl=header.getValue(); if(newUrl==null||newUrl.equals("")){ newUrl="/"; //使用post转向 PostMethod redirect=new PostMethod(newUrl); 
//发送请求，做进一步处理。。。。 } } } return false; } //测试代码 public static void main(String[] args) { //抓取猎兔首页，输出 try { RetrivePage.downloadPage("http://www.lietuw.com/"); System.out.println("执行成功!"); } catch (HttpException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } }

转载请注明原文地址: https://www.6miu.com/read-77551.html

技术

最新回复(0)