Java数据爬虫程序jsoup

xiaoxiao2021-02-28  39

1、导入依赖包(在 Maven 的 pom.xml 中添加 jsoup 依赖)

<dependency>
    <!-- jsoup HTML parser library @ https://jsoup.org/ (place inside the <dependencies> section of pom.xml) -->
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.2</version>
</dependency>

2、程序应用(Test2.java,程序入口)

package com.zemel.pc; import java.util.ArrayList; import java.util.List; public class Test2 { /** * * @author Administrator * @time 2018-4-8 上午10:15:29 * @param args */ public static void main(String[] args) { // TODO Auto-generated method stub List<String> proList = new ArrayList<String>(); proList.add("http://yz.chsi.com.cn/zsml/kskm.jsp?id=1000121023010107061"); // 读取 for(String prof : proList){ Rule rule = new Rule(prof, new String[]{}, new String[]{}, "table.zsml-condition", Rule.SELECTION, Rule.GET); String str = ExtractService.extract(rule).toString(); String code = prof.substring(prof.indexOf("id=")+3); str = str + " " + code.substring(code.length() - 9, code.length()-3); System.out.println(str); } } }

3、Rule.java

package com.zemel.pc; public class Rule { /** 链接 */ private String url; /** * 参数集合 */ private String[] params; /** * 参数对应的值 */ private String[] values; /** * 对返回的HTML,第一次过滤所用的标签,请先设置type */ private String resultTagName; /** * CLASS / ID / SELECTION * 设置resultTagName的类型,默认为ID */ private int type = ID ; private String code; /** *GET / POST * 请求的类型,默认GET */ private int requestMoethod = GET ; public final static int GET = 0 ; public final static int POST = 1 ; public final static int CLASS = 0; public final static int ID = 1; public final static int SELECTION = 2; public Rule() { } public Rule(String url, String[] params, String[] values, String resultTagName, int type, int requestMoethod) { super(); this.url = url; this.params = params; this.values = values; this.resultTagName = resultTagName; this.type = type; this.requestMoethod = requestMoethod; } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String[] getParams() { return params; } public void setParams(String[] params) { this.params = params; } public String[] getValues() { return values; } public void setValues(String[] values) { this.values = values; } public String getResultTagName() { return resultTagName; } public void setResultTagName(String resultTagName) { this.resultTagName = resultTagName; } public int getType() { return type; } public void setType(int type) { this.type = type; } public int getRequestMoethod() { return requestMoethod; } public void setRequestMoethod(int requestMoethod) { this.requestMoethod = requestMoethod; } public String getCode() { return code; } public void setCode(String code) { this.code = code; } }

4、ExtractService.java(注:代码中抛出的 RuleException 是自定义的非受检异常,需继承 RuntimeException 并提供接收 String 消息的构造方法,本文未列出其源码)

package com.zemel.pc; import java.io.IOException; import org.jsoup.Connection; import org.jsoup.Jsoup; import org.jsoup.helper.StringUtil; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; public class ExtractService { /** * @param rule * @return */ public static StringBuffer extract(Rule rule) { // 进行对rule的必要校验 validateRule(rule); StringBuffer sb = new StringBuffer(); try { /** * 解析rule */ String url = rule.getUrl(); String[] params = rule.getParams(); String[] values = rule.getValues(); String resultTagName = rule.getResultTagName(); int type = rule.getType(); int requestType = rule.getRequestMoethod(); Connection conn = Jsoup.connect(url); // 设置查询参数 if (params != null) { for (int i = 0; i < params.length; i++) { conn.data(params[i], values[i]); } } // 设置请求类型 Document doc = null; switch (requestType) { case Rule.GET : doc = conn.timeout(100000).get(); break; case Rule.POST : doc = conn.timeout(100000).post(); break; } // 处理返回数据 Elements results = new Elements(); results = getResults(type, resultTagName, doc); for (Element result : results) { Elements links = result.getElementsByTag("td"); for (Element link : links) { String text = link.html(); Element preElement = link.previousElementSibling(); if(preElement != null){ String preText = link.previousElementSibling().html(); if("指导老师:".equals(preText.trim())){ continue; } } if(!text.contains("<span") && !text.contains("target")){ if(text != null && !text.contains(":") && !"不区分导师".equals(text.trim()) ){ sb.append(text).append(" "); }else{ String[] ss = text.split(":"); if(ss.length > 1){ String s2 = ""; if(ss.length > 2){ s2 = ss[2]; } sb.append(ss[1].split(",")[0]+ " " + s2); } } } } } } catch (IOException e) { e.printStackTrace(); } return sb; } /** * 对传入的参数进行必要的校验 */ private static void validateRule(Rule rule) { String url = rule.getUrl(); if (StringUtil.isBlank(url)) { throw new RuleException("url不能为空!"); } if (!url.startsWith("http://")) { throw new 
RuleException("url的格式不正确!"); } if (rule.getParams() != null && rule.getValues() != null) { if (rule.getParams().length != rule.getValues().length) { throw new RuleException("参数的键值对个数不匹配!"); } } } private static Elements getResults(int type, String resultTagName, Document doc){ Elements results = new Elements(); switch (type) { case Rule.CLASS : results = doc.getElementsByClass(resultTagName); break; case Rule.ID : Element result = doc.getElementById(resultTagName); results.add(result); break; case Rule.SELECTION : results = doc.select(resultTagName); break; default : // 当resultTagName为空时默认去body标签 if (StringUtil.isBlank(resultTagName)) { results = doc.getElementsByTag("body"); } } return results; } }

转载请注明原文地址: https://www.6miu.com/read-2623342.html

最新回复(0)