使用HttpClient和Jsoup定向抓取数据

xiaoxiao2021-02-28  113

1.业务需求:

从指定外网抓点货,冷启动

2.站点分析:

1. 限制IP;2. 需要登录;3. 对登录账号有抓取频率限制;4. 允许的抓取频率上限过低,一旦超过就直接跳验证码页面;5. 验证码的长度、样式(纯数字或字母数字混合)都不固定……

“我们能不能不抓了?” “不行!必须得抓……” “……”

这么说,此前写的那套多线程、“生产者→消费者”并发抓取的爬虫压根行不通,多线程在这里毫无意义。

3.使用技术:

1.HttpClient:读取指定URL网页内容 2.Jsoup:解析所要的页面数据——省得写恶心的正则表达式 3.Swing:绘制用户操作界面 4.Tess4J:自动识别验证码(http://tess4j.sourceforge.net/) 5.Exe4J:生成可独立运行的exe程序——给每人机器安装一个,大家一起监控抓~

4.实现要点:

1.代理IP 从一些网站上抓取代理IP,并检测是否可以使用,如下:

package com.ydj.zhuaqu.proxy;

import java.io.Closeable;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import net.sf.json.JSONArray;
import net.sf.json.JSONObject;

import org.apache.commons.collections.map.LRUMap;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;

import com.ydj.common.kit.MyLog;

/**
 * Pool of HTTP proxies. A scheduled task periodically fetches candidate
 * "ip:port" entries from a proxy provider, probes each one, and keeps the
 * working ones in {@link #canUseIpList}. Callers check a proxy out with
 * {@link #useOneProxyIp()} and hand it back with
 * {@link #returnProxyIp(ProxyIp)}.
 *
 * <p>The pool list and the blacklist are both synchronized wrappers, so the
 * public static methods may be called from several threads.
 *
 * @author : Ares.yi
 * @createTime : 2014-11-10 11:13:42
 * @version : 1.0
 */
public class ProxyIpPool {

    /** Upper bound on the number of proxies kept in the pool. */
    private static final int MAX_IP = 100;

    /** Lower bound on the pool size (ideally equal to the number of worker threads); currently unused. */
    @SuppressWarnings("unused")
    private static final int MIN_IP = 10;

    /** Proxies that passed the checks and are currently not checked out. */
    public static List<ProxyIp> canUseIpList = Collections.synchronizedList(new ArrayList<ProxyIp>(MAX_IP));

    /**
     * Recently rejected proxy IPs (ip -> port), bounded to 2000 entries.
     * LRUMap itself is not thread-safe, so it is wrapped: the crawl thread and
     * callers of {@link #addIP}/{@link #removeIP} touch it concurrently.
     */
    @SuppressWarnings("unchecked")
    private static final Map<Object, Object> notCanUseIPsTemp = Collections.synchronizedMap(new LRUMap(2000));

    /** Number of proxies requested from the provider per crawl. */
    private static final int NUM = 20;

    private static final String ORDER_ID = "904557733280949";

    private static final String KDL_URL = "http://dev.kuaidaili.com/api/getproxy?orderid=" + ORDER_ID + "&num=" + NUM
            + "&quality=1&an_ha=1&dedup=1&format=json";

    /** Page on the target site used to verify that a proxy can really reach it. */
    private static final String TARGET_CHECK_URL = "http://www.autozi.com/partCategory.html/";

    /** Connect/read timeout, in milliseconds, used by every proxy probe. */
    private static final int CHECK_TIMEOUT_MILLIS = 3000;

    /** How long {@link #useOneProxyIp()} sleeps between polls of an empty pool. */
    private static final long EMPTY_POOL_WAIT_MILLIS = 2 * 60 * 1000L;

    private ProxyIpPool() {
    }

    /**
     * Starts the background task that refills the pool: first run after one
     * minute, then every three minutes.
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 17:58:54
     */
    public static void startCrawl() {
        final int period = 3;

        ScheduledExecutorService scheduledExecutorService = Executors.newScheduledThreadPool(1);

        scheduledExecutorService.scheduleAtFixedRate(new Runnable() {
            int i = 0;

            @Override
            public void run() {
                try {
                    produceIP(i);
                } catch (RuntimeException e) {
                    // scheduleAtFixedRate cancels all later runs if one run throws,
                    // so a single bad crawl must never escape from here.
                    MyLog.logError(i + ":crawl proxy ip failed:" + e);
                } finally {
                    i++;
                }
            }
        }, 1, period, TimeUnit.MINUTES);
    }

    /**
     * Runs one crawl round unless the pool is already full.
     *
     * @param i sequence number of the round, used only in log output
     */
    private static void produceIP(int i) {
        int currentSize = canUseIpList.size();

        if (currentSize >= MAX_IP) {
            MyLog.logInfo(i + ":current proxyPool size is:" + currentSize
                    + ",no need crawl new ip.NotCanUseIPsTemp size is:" + notCanUseIPsTemp.size());
            return;
        }

        JSONArray ips = getIPFromKuaiDaiLi();

        produceIP(ips);

        MyLog.logInfo(i + ":current proxyPool size is:" + canUseIpList.size() + ",notCanUseIPsTemp size is:"
                + notCanUseIPsTemp.size());
    }

    /**
     * Probes every "ip:port" entry of {@code ips}; working proxies are added
     * to the pool, failing ones are blacklisted. Malformed entries are logged
     * and skipped instead of aborting the whole batch.
     *
     * @param ips entries as returned by the provider; may be null or empty
     */
    private static void produceIP(JSONArray ips) {
        if (ips == null || ips.isEmpty()) {
            return;
        }

        for (int i = 0; i < ips.size(); i++) {
            String entry = String.valueOf(ips.get(i));
            String[] parts = entry.split(":");
            if (parts.length < 2) {
                MyLog.logInfo("skip malformed proxy entry:" + entry);
                continue;
            }

            String ip = parts[0].trim();
            int port;
            try {
                port = Integer.parseInt(parts[1].trim());
            } catch (NumberFormatException e) {
                MyLog.logInfo("skip malformed proxy entry:" + entry);
                continue;
            }

            ProxyIp proxyIp = new ProxyIp(ip, port);

            if (isCanUse(ip, port)) {
                addIP(proxyIp);
            } else {
                removeIP(proxyIp);
            }
        }
    }

    /**
     * Checks out the first proxy in the pool's sort order. The proxy is
     * removed from the pool until it is given back with
     * {@link #returnProxyIp(ProxyIp)}.
     *
     * <p>While the pool is empty this method blocks, polling every two
     * minutes, until a proxy becomes available.
     *
     * @return the checked-out proxy, never null
     * @throws IllegalStateException if the calling thread is interrupted while
     *         waiting; the interrupt flag is restored first
     */
    public static ProxyIp useOneProxyIp() {
        final List<ProxyIp> pool = canUseIpList;

        while (true) {
            ProxyIp proxyIp = null;

            // Sort + remove must be one atomic step; the synchronized wrapper
            // only guards individual calls and uses itself as the lock.
            synchronized (pool) {
                if (!pool.isEmpty()) {
                    Collections.sort(pool);
                    proxyIp = pool.remove(0);
                }
            }

            if (proxyIp != null) {
                proxyIp.useThis();
                return proxyIp;
            }

            MyLog.logInfo(Thread.currentThread().getName()
                    + " useOneProxyIp,but proxyPool is empty,need to wait 2 min crawl IP.");
            try {
                Thread.sleep(EMPTY_POOL_WAIT_MILLIS);
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new IllegalStateException("interrupted while waiting for a proxy ip", e);
            }
        }
    }

    /**
     * Gives a proxy obtained from {@link #useOneProxyIp()} back to the pool.
     *
     * @param proxyIp the proxy to return
     */
    public static void returnProxyIp(ProxyIp proxyIp) {
        proxyIp.setUseing(false);
        canUseIpList.add(proxyIp);
    }

    /**
     * Fetches candidate proxies from the kuaidaili API.
     *
     * @return the "proxy_list" array of the response, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:36:05
     */
    private static JSONArray getIPFromKuaiDaiLi() {
        return fetchProxyList(KDL_URL);
    }

    /**
     * Placeholder for fetching candidate proxies from further providers.
     *
     * @return the "proxy_list" array of the response, or an empty array on any failure
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:46:40
     */
    @SuppressWarnings("unused")
    private static JSONArray getIPFromXXX() {
        return fetchProxyList("XXX");
    }

    /**
     * Downloads {@code url} and extracts {@code data.proxy_list} from the JSON
     * response.
     *
     * @param url provider API address
     * @return the extracted array, or an empty array if the request or parsing fails
     */
    private static JSONArray fetchProxyList(String url) {
        JSONArray ips = new JSONArray();

        HttpClient client = new HttpClient();

        GetMethod method = new GetMethod(url);

        HttpMethodParams param = method.getParams();
        param.setContentCharset("UTF-8");

        try {
            client.executeMethod(method);
            String res = method.getResponseBodyAsString();

            JSONObject json = JSONObject.fromObject(res);

            if (json != null && json.containsKey("data")) {
                ips = json.getJSONObject("data").getJSONArray("proxy_list");
                MyLog.logInfo(ips);
            }
        } catch (Exception e) {
            MyLog.logError("fetch proxy list failed:" + e);
        } finally {
            // The connection stays checked out until it is released explicitly.
            method.releaseConnection();
        }

        return ips;
    }

    /**
     * Decides whether a proxy is usable: valid port, not blacklisted,
     * reachable over TCP, and able to load the target site.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return true if all checks pass
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 14:37:22
     */
    private static boolean isCanUse(String ip, int port) {
        // Out-of-range ports would make InetSocketAddress throw.
        if (port <= 0 || port > 65535) {
            return false;
        }

        if (notCanUseIPsTemp.containsKey(ip)) {
            MyLog.logInfo(ip + ":" + port + " can't use again.");
            return false;
        }

        if (!checkIp(ip, port)) {
            return false;
        }

        return checkIpUseTargetSite(ip, port);
    }

    /**
     * Cheap reachability probe: tries to open a TCP connection to the proxy
     * within three seconds.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return true if the connection was established
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:35:28
     */
    private static boolean checkIp(String ip, int port) {
        Socket server = new Socket();
        try {
            InetSocketAddress address = new InetSocketAddress(ip, port);
            server.connect(address, CHECK_TIMEOUT_MILLIS);

            MyLog.logInfo(ip + ":" + port + " is ok!");
            return true;
        } catch (UnknownHostException e) {
            MyLog.logInfo(ip + ":" + port + " is wrong!");
        } catch (IOException e) {
            MyLog.logInfo(ip + ":" + port + " is wrong!!");
        } finally {
            try {
                server.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing a probe socket fails.
            }
        }
        return false;
    }

    /**
     * Accurate probe: loads a page of the target site through the proxy and
     * checks that the expected header element is present.
     *
     * @param ip   proxy host
     * @param port proxy port
     * @return true if the page came back with the expected markup
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 12:06:03
     */
    private static boolean checkIpUseTargetSite(String ip, int port) {
        CloseableHttpClient closeableHttpClient = HttpClientBuilder.create().build();
        CloseableHttpResponse response = null;

        try {
            HttpHost proxy = new HttpHost(ip, port, "http");
            RequestConfig config = RequestConfig.custom().setConnectTimeout(CHECK_TIMEOUT_MILLIS)
                    .setSocketTimeout(CHECK_TIMEOUT_MILLIS).setProxy(proxy).build();

            HttpGet httpGet = new HttpGet(TARGET_CHECK_URL);
            httpGet.setConfig(config);

            response = closeableHttpClient.execute(httpGet);
            HttpEntity httpentity = response.getEntity();
            String html = EntityUtils.toString(httpentity, "UTF-8");

            return Jsoup.parse(html).select("div[class=header fix]").first() != null;
        } catch (Exception exc) {
            MyLog.logError(exc.getMessage());
            return false;
        } finally {
            closeQuietly(response);
            closeQuietly(closeableHttpClient);
        }
    }

    /** Closes {@code resource} if it is non-null, ignoring any failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Best-effort cleanup only.
        }
    }

    /**
     * Removes a proxy from the pool and blacklists its IP.
     *
     * @param proxyIp the proxy that turned out to be unusable
     */
    public static void removeIP(ProxyIp proxyIp) {
        canUseIpList.remove(proxyIp);
        notCanUseIPsTemp.put(proxyIp.getIp(), proxyIp.getPort());
    }

    /**
     * Adds a proxy to the pool and clears its IP from the blacklist.
     *
     * @param proxyIp the proxy that passed the checks
     */
    public static void addIP(ProxyIp proxyIp) {
        canUseIpList.add(proxyIp);
        notCanUseIPsTemp.remove(proxyIp.getIp());
    }

    /**
     * Manual smoke test: 20 jobs on 10 threads each check out a proxy, hold it
     * for a random 0-9 seconds, and return it.
     *
     * @author : Ares.yi
     * @createTime : 2015-10-29 18:00:16
     */
    private static void testUseProxyIp() {
        ExecutorService threadPool = Executors.newFixedThreadPool(10);

        for (int i = 0; i < 20; i++) {
            final int flag = i;

            threadPool.execute(new Runnable() {

                @Override
                public void run() {
                    ProxyIp proxyIp = useOneProxyIp();

                    MyLog.logInfo(flag + " job " + Thread.currentThread().getName() + " get proxyIp is : "
                            + proxyIp.toString());

                    long millis = new Random().nextInt(10) * 1000;

                    try {
                        // Each job sleeps a random number of seconds to simulate work.
                        Thread.sleep(millis);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }

                    returnProxyIp(proxyIp);

                    MyLog.logInfo(flag + " job " + Thread.currentThread().getName() + " use proxyIp is : "
                            + proxyIp.toString() + ",work use time " + millis + " end and return to pool.");
                }
            });
        }

        // Let the queued jobs finish, then allow the worker threads to exit.
        threadPool.shutdown();
    }
}

使用代理IP:

/** * 使用代理获取网页内容 * * @param url * @param proxyIp * @param proxyPort * @return * @throws ParseException * @throws IOException * * @author : Ares.yi * @createTime : 2015年10月30日 上午9:55:21 */ public static String getHtml(String url,String proxyIp,int proxyPort) throws ParseException, IOException { HttpClientBuilder httpClientBuilder = HttpClientBuilder.create(); CloseableHttpClient closeableHttpClient = httpClientBuilder.build(); HttpHost proxy = new HttpHost(proxyIp,proxyPort, "http"); RequestConfig config = RequestConfig.custom().setConnectTimeout(3000).setSocketTimeout(3000).setProxy(proxy).build(); HttpPost httpGet = new HttpPost(url); httpGet.setConfig(config); String html = ""; CloseableHttpResponse response = null; try { response = closeableHttpClient.execute(httpGet); }catch(Exception exc){ exc.printStackTrace(); System.out.println("get请求失败!"); return "cannot connect"; } HttpEntity httpEntity = response.getEntity(); if (httpEntity != null) { // 打印响应内容 try{ html = EntityUtils.toString(httpEntity, "UTF-8"); }catch(Exception excep){ System.out.println(url); } }else{ return "cannot connect"; } closeableHttpClient.close(); return html; }

2.模拟登录 提取登录Cookie和User-Agent:

代码片段,如下:

public static String postRequest(String url, Map<String, String> parameterMap, String charSet) throws UnsupportedEncodingException { CloseableHttpClient client = HttpClients.createDefault(); HttpPost httpPost = new HttpPost(url); UrlEncodedFormEntity postEntity = new UrlEncodedFormEntity(getParam(parameterMap), charSet); httpPost.setEntity(postEntity); httpPost.addHeader("HOST", "sec.1688.com"); httpPost.addHeader("User-Agent", Constant.userAgent); httpPost.addHeader("Accept","text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); httpPost.addHeader("Cookie", Constant.cookie); MyLog.logInfo("request line:" + httpPost.getRequestLine()); try { // 执行post请求 HttpResponse httpResponse = client.execute(httpPost); Header header = httpResponse.getFirstHeader("Location"); if (header != null && Toolbox.isNotEmpty(header.getValue())) { MyLog.logInfo("location:" + header.getValue()); return "SUCCESS"; } else { String html = printResponse(httpResponse); return html; } } catch (IOException e) { e.printStackTrace(); } finally { try { client.close(); } catch (IOException e) { } } return ""; }

3.验证码 获取输入验证码页面信息:

/**
 * Extracts the hidden form fields and the captcha session id from the
 * "enter verification code" page.
 *
 * @param url               value passed through unchanged as the last
 *                          constructor argument of the form data
 * @param checkCodePageHtml HTML of the verification-code page
 * @return the parsed form data, or null if {@code checkCodePageHtml} is empty
 */
public static Ali1688CheckCodeFormData getCheckCodeFormData(String url, String checkCodePageHtml) {
    if (Toolbox.isEmptyString(checkCodePageHtml)) {
        return null;
    }

    Document doc = Jsoup.parse(checkCodePageHtml);

    String action = doc.select("input[name=action]").attr("value");
    String event_submit_do_query = doc.select("input[name=event_submit_do_query]").attr("value");
    String smPolicy = doc.select("input[name=smPolicy]").attr("value");
    String smReturn = doc.select("input[name=smReturn]").attr("value");
    String smApp = doc.select("input[name=smApp]").attr("value");
    String smCharset = doc.select("input[name=smCharset]").attr("value");
    String smTag = doc.select("input[name=smTag]").attr("value");
    String smSign = doc.select("input[name=smSign]").attr("value");
    String identity = doc.select("input[name=identity]").attr("value");
    String captcha = doc.select("input[name=captcha]").attr("value");

    // The session id is the value of the "sessionid" parameter in the captcha
    // image URL. Look for the terminating '&' only AFTER the parameter starts,
    // and tolerate a missing image, a missing parameter, or the parameter
    // being last - the previous substring call threw in all of those cases.
    String imgSrc = doc.select("img[id=checkcodeImg]").attr("src");
    String sessionid = "";
    final String key = "sessionid=";
    int start = imgSrc.indexOf(key);
    if (start >= 0) {
        start += key.length();
        int end = imgSrc.indexOf('&', start);
        sessionid = end >= 0 ? imgSrc.substring(start, end) : imgSrc.substring(start);
    }

    return new Ali1688CheckCodeFormData(action, event_submit_do_query, smPolicy, smReturn, smApp, smCharset,
            smTag, smSign, identity, captcha, sessionid, url);
}

提交验证码:

/**
 * Submits a verification code using the form data currently stored in
 * {@code Constant.ali1688CheckCodeFormData}.
 *
 * <p>On failure the response is re-parsed as a new verification page and the
 * stored form data is replaced with the result.
 * NOTE(review): that result is null when the response is empty, which would
 * make the next call fail - confirm how callers handle this.
 *
 * @param checkcode the code typed in or recognized for the captcha image
 * @return {@code "SUCCESS"} if the site accepted the code, otherwise an empty string
 * @throws UnsupportedEncodingException if a query value or the form body cannot be encoded
 * @throws IOException                  declared for backward compatibility
 */
public static String submitCheckCode(String checkcode) throws UnsupportedEncodingException, IOException {
    Ali1688CheckCodeFormData form = Constant.ali1688CheckCodeFormData;

    String smApp = form.getSmApp();
    String smPolicy = form.getSmPolicy();
    String smCharset = form.getSmCharset();
    String smTag = form.getSmTag();
    String smReturn = form.getSmReturn();
    String smSign = form.getSmSign();

    // Encode each VALUE separately. Encoding the assembled query string as a
    // whole also escapes '=' and '&', which collapses all parameters into one.
    String query = "smApp=" + encodeQueryValue(smApp)
            + "&smPolicy=" + encodeQueryValue(smPolicy)
            + "&smCharset=" + encodeQueryValue(smCharset)
            + "&smTag=" + encodeQueryValue(smTag)
            + "&smReturn=" + encodeQueryValue(smReturn)
            + "&smSign=" + encodeQueryValue(smSign);

    String formAction = "https://sec.1688.com/query.htm?" + query;

    Map<String, String> parameterMap = new HashMap<String, String>();
    parameterMap.put("action", form.getAction());
    parameterMap.put("event_submit_do_query", form.getEvent_submit_do_query());
    parameterMap.put("smPolicy", smPolicy);
    parameterMap.put("smReturn", smReturn);
    parameterMap.put("smApp", smApp);
    parameterMap.put("smCharset", smCharset);
    parameterMap.put("smTag", smTag);
    parameterMap.put("smSign", smSign);
    parameterMap.put("identity", form.getIdentity());
    parameterMap.put("captcha", form.getCaptcha());
    parameterMap.put("checkcode", checkcode);

    String res = HttpKit.postRequest(formAction, parameterMap, "UTF-8");

    if ("SUCCESS".equals(res)) {
        return "SUCCESS";
    }

    // Rejected: the response is treated as a fresh verification page.
    Constant.ali1688CheckCodeFormData = getCheckCodeFormData(smReturn, res);
    return "";
}

/** URL-encodes a single query-string value as UTF-8; null is treated as an empty value. */
private static String encodeQueryValue(String value) throws UnsupportedEncodingException {
    return java.net.URLEncoder.encode(value == null ? "" : value, "utf-8");
}

4.exe4j操作:

5.部分界面:

6.源码:

https://github.com/Aresyi/simpleSpider

转载请注明原文地址: https://www.6miu.com/read-63648.html

最新回复(0)