java爬取页面

xiaoxiao2021-02-28 57

一、所需环境

1、IntelliJ IDEA 2、Maven 3.9 3、JDK 1.8 4、jsoup 1.7.3

二、环境搭建

1、首先分析页面的布局，我抓取的是csdn中文章内容，打开一篇文章

2、按F12查看页面布局。我这里只抓取文章下方“上一篇”的链接及其标题，先查看这部分对应的页面代码

3、可以看到，我们需要先找到class为prev_article的元素，再抓取其中的超链接，以及超链接内的文字。

4、分析后就可以编码了。

5、在idea中新建一个maven项目，名为csdn，结构如下图

6、pom内容如下

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the "csdn" crawler demo project. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <!-- Project coordinates; the artifact is packaged as a plain jar. -->
    <groupId>com.hjduan</groupId>
    <artifactId>csdn</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <!-- Source files are read as UTF-8 (the sources contain Chinese string literals). -->
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <!-- jsoup: used by GrabService to fetch and parse HTML. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.7.3</version>
        </dependency>
        <!-- JUnit 4; declared without a scope element, so it uses Maven's default scope. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <!-- Compile with Java 8 source and target levels. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <!-- Surefire is configured to skip running tests during the build. -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <skipTests>true</skipTests>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>

7、新建一个Data类，用来存放抓取的内容

package com.hjduan.csdn.model; /** * Created with IntelliJ IDEA. * Description: 数据类 * 2017-08-05-22:34 */ public class Data { private String time; private String href; private String linkName; public Data() { } public Data(String time, String href, String linkName) { this.time = time; this.href = href; this.linkName = linkName; } public String getTime() { return time; } public void setTime(String time) { this.time = time; } public String getHref() { return href; } public void setHref(String href) { this.href = href; } public String getLinkName() { return linkName; } public void setLinkName(String linkName) { this.linkName = linkName; } public String toString() { return linkName+"-->"+href+"-->"+"在"+time+" 抓取"; } }

8、新建一个抓取规则类

package com.hjduan.csdn.rule; /** * Created with IntelliJ IDEA. * Description: 规则类 * 2017-08-05-22:40 */ public class Rule { /** * 链接 */ private String url; /** * 对返回的HTML进行过滤 */ private String resultTagName; /** * CLASS / ID / SELECTION * 设置过滤resultTagName的类型，默认为ID */ private int type = ID; /** * GET / POST * 请求的类型，默认GET */ private int requestMoethod = GET; public final static int GET = 0; public final static int POST = 1; public final static int CLASS = 0; public final static int ID = 1; public final static int SELECTION = 2; public Rule(String url, String resultTagName, int type, int requestMoethod) { this.url = url; this.resultTagName = resultTagName; this.type = type; this.requestMoethod = requestMoethod; } public Rule() { } public String getUrl() { return url; } public void setUrl(String url) { this.url = url; } public String getResultTagName() { return resultTagName; } public void setResultTagName(String resultTagName) { this.resultTagName = resultTagName; } public int getType() { return type; } public void setType(int type) { this.type = type; } public int getRequestMoethod() { return requestMoethod; } public void setRequestMoethod(int requestMoethod) { this.requestMoethod = requestMoethod; } }

9、自定义规则异常类

package com.hjduan.csdn.rule; /** * Created with IntelliJ IDEA. * Description: 自定义异常类 * 2017-08-05-22:44 */ public class RuleException extends RuntimeException { public RuleException() { super(); } public RuleException(String message) { super(message); } public RuleException(String message, Throwable cause) { super(message, cause); } public RuleException(Throwable cause) { super(cause); } }

10、核心抓取类

package com.hjduan.csdn.service;

import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.rule.RuleException;
import com.hjduan.csdn.utils.DateUtil;
import com.hjduan.csdn.utils.StringUtil;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Core grabbing service: fetches the page described by a {@link Rule}, narrows
 * it down to the configured elements and collects every hyperlink inside them.
 *
 * <p>Created 2017-08-05 22:47.
 */
public class GrabService {

    /** Connection timeout passed to jsoup, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 100000;

    /**
     * Fetches the rule's URL and returns one {@link Data} per {@code <a>}
     * element found inside the elements selected by the rule.
     *
     * @param rule what to fetch and how to filter the result
     * @return the collected links; empty when nothing matched or when the page
     *         could not be fetched (the I/O error is printed, not rethrown)
     * @throws RuleException if the rule is {@code null}, its URL is empty or
     *                       does not start with the accepted prefix, or its
     *                       request type is neither GET nor POST
     */
    public static List<Data> extract(Rule rule) {
        // Validate the rule before doing any network work.
        validateRule(rule);

        List<Data> datas = new ArrayList<Data>();
        try {
            Document doc = fetchDocument(rule);
            for (Element result : selectResults(doc, rule)) {
                for (Element link : result.getElementsByTag("a")) {
                    Data data = new Data();
                    data.setHref(link.attr("href"));
                    data.setLinkName(link.text());
                    data.setTime(DateUtil.getFormatDate());
                    datas.add(data);
                }
            }
        } catch (IOException e) {
            // Best effort: a failed fetch yields whatever was collected so far
            // (normally nothing) instead of aborting the caller.
            e.printStackTrace();
        }
        return datas;
    }

    /** Downloads the rule's URL using the request type the rule asks for. */
    private static Document fetchDocument(Rule rule) throws IOException {
        Connection conn = Jsoup.connect(rule.getUrl()).timeout(TIMEOUT_MILLIS);
        int requestType = rule.getRequestMoethod();
        switch (requestType) {
            case Rule.GET:
                return conn.get();
            case Rule.POST:
                return conn.post();
            default:
                // Fail with a clear message instead of a later NullPointerException.
                throw new RuleException("不支持的请求类型：" + requestType);
        }
    }

    /** Picks the elements of {@code doc} that the rule's filter settings designate. */
    private static Elements selectResults(Document doc, Rule rule) {
        String resultTagName = rule.getResultTagName();
        // With no filter value there is nothing to look up: use the whole body.
        if (StringUtil.isEmpty(resultTagName)) {
            return doc.getElementsByTag("body");
        }

        switch (rule.getType()) {
            case Rule.CLASS:
                return doc.getElementsByClass(resultTagName);
            case Rule.ID: {
                Elements results = new Elements();
                Element result = doc.getElementById(resultTagName);
                // The id may be absent from the page; never store null in the list.
                if (result != null) {
                    results.add(result);
                }
                return results;
            }
            case Rule.SELECTION:
                return doc.select(resultTagName);
            default:
                // Unknown filter type with a non-empty value selects nothing.
                return new Elements();
        }
    }

    /**
     * Performs the necessary checks on the incoming rule.
     *
     * @throws RuleException if the rule is {@code null}, or its URL is empty or
     *                       does not start with {@code http://blog.csdn.net/}
     */
    private static void validateRule(Rule rule) {
        if (rule == null) {
            throw new RuleException("rule不能为空！");
        }
        String url = rule.getUrl();
        if (StringUtil.isEmpty(url)) {
            throw new RuleException("url不能为空！");
        }
        if (!url.startsWith("http://blog.csdn.net/")) {
            throw new RuleException("url的格式不正确！");
        }
    }
}

11、测试代码

package com.hjduan.csdn.test;

import com.hjduan.csdn.service.GrabService;
import com.hjduan.csdn.model.Data;
import com.hjduan.csdn.rule.Rule;
import com.hjduan.csdn.utils.StringUtil;

import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
 * Manual driver that follows the links selected by a rule from page to page
 * and prints every link it finds.
 *
 * <p>Created 2017-08-05 23:40.
 */
public class TestGrab {

    /** URLs already crawled by this instance; stops link cycles from recursing forever. */
    private final Set<String> visited = new HashSet<String>();

    /**
     * Grabs the links selected by {@code rule}, prints each one, and then
     * recursively grabs every linked page that has not been crawled yet.
     * The rule's URL is overwritten with each followed link.
     *
     * @param rule the rule to start from; reused (and mutated) for every page
     */
    public void getGrab(Rule rule) {
        List<Data> datas = GrabService.extract(rule);
        visited.add(rule.getUrl());
        for (Data data : datas) {
            print(data);
            String href = data.getHref();
            // Set.add returns false for a URL seen before, so each page is followed once.
            if (!StringUtil.isEmpty(href) && visited.add(href)) {
                rule.setUrl(href);
                getGrab(rule);
            }
        }
    }

    /**
     * Prints a separator line followed by the record.
     *
     * @param data the record to print
     */
    public void print(Data data) {
        System.out.println("<==============================================================="
                + "==============================================================="
                + "===============================================================>");
        System.out.println(data);
    }

    /** Starts the crawl at a fixed CSDN article, selecting elements by CSS class. */
    public static void main(String[] args) {
        Rule rule = new Rule("http://blog.csdn.net/two_people/article/details/76783943",
                "prev_article", Rule.CLASS, Rule.GET);
        TestGrab testGrab = new TestGrab();
        testGrab.getGrab(rule);
    }
}

12、辅助类

// ---- StringUtil.java ----
package com.hjduan.csdn.utils;

/**
 * String helper methods.
 *
 * <p>Created 2017-08-05 22:46.
 */
public class StringUtil {

    /**
     * Tells whether a string carries no content.
     *
     * @param str the string to test; may be {@code null}
     * @return {@code true} if {@code str} is {@code null} or has length zero
     *         after {@link String#trim()}, {@code false} otherwise
     */
    public static boolean isEmpty(String str) {
        return str == null || str.trim().length() == 0;
    }
}

// ---- DateUtil.java ----
package com.hjduan.csdn.utils;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;

/**
 * Date helper methods.
 *
 * <p>Created 2017-08-05 23:34.
 */
public class DateUtil {

    /** Shared formatter for the {@code yyyy-MM-dd HH:mm:ss} pattern. */
    private static final DateTimeFormatter FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    /**
     * Formats the current local date and time.
     *
     * @return the current time as {@code yyyy-MM-dd HH:mm:ss}
     */
    public static String getFormatDate() {
        return LocalDateTime.now().format(FORMATTER);
    }
}

13、源码地址

https://gitee.com/lgr123/grab

转载请注明原文地址: https://www.6miu.com/read-82893.html

技术

最新回复(0)