This version uses jsoup, a third-party library that makes parsing HTML remarkably easy, and a byte buffer for the file input/output to improve efficiency.
Downloader class
package just4test2;

import java.io.*;
import java.net.*;

import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;

public class Downloader {
    private String base, path;

    public Downloader(String abase, String apath) {
        base = abase;
        path = apath;
    }

    public void download(String src) throws Exception {
        File fp = new File(path);
        if (!fp.exists())
            fp.mkdirs();                                    // create the target directory if it does not exist
        int pos = src.lastIndexOf('/');
        String filename = src.substring(pos);               // keeps the leading '/', so path + filename is still a valid path
        URL url = new URL(src);
        InputStream cin = url.openStream();                 // open the connection and get an input stream
        FileOutputStream cout = new FileOutputStream(path + filename); // output stream for the local file
        int size = 0;
        byte[] buffer = new byte[1024];                     // 1 KB buffer: read a chunk, then write it to the file in one go for efficiency
        while ((size = cin.read(buffer)) != -1)
            cout.write(buffer, 0, size);                    // write exactly the bytes just read, starting at offset 0
        cin.close();
        cout.close();
    }

    public void getPic() throws Exception {
        Connection con = Jsoup.connect(base);               // open the page with jsoup
        Document doc = con.get();                           // fetch and parse it into a Document
        // find all img tags
        Elements imgs = doc.getElementsByTag("img");        // grabs the elements by tag, giving an Elements collection -- very apt naming
        int j = 1;
        for (Element x : imgs) {
            String imgSrc = x.attr("abs:src");              // resolves the src to an absolute URL (the step I didn't fully understand at first)
            imgSrc = imgSrc.replaceAll("\\s", "");          // strip any whitespace
            System.out.printf("Downloading file #%d", j++);
            System.out.print(", URL: ");
            System.out.println(imgSrc);
            download(imgSrc);
        }
    }
}

Main class
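About the "abs:src" step I didn't understand: jsoup keeps the URL passed to Jsoup.connect() as the document's base URI, and attr("abs:src") resolves a relative src against it, so download() always receives a full address. A minimal throwaway sketch of the difference (the HTML fragment, the base URI and the class name are made up for the demo, not part of the crawler):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class AbsSrcDemo {
    public static void main(String[] args) {
        // Parse a fragment with an explicit base URI -- the same thing connect() sets for a fetched page.
        Document d = Jsoup.parse("<img src='/pics/a.jpg'>", "http://example.com/page/");
        Element img = d.getElementsByTag("img").first();
        System.out.println(img.attr("src"));      // "/pics/a.jpg"  -- exactly as written in the HTML
        System.out.println(img.attr("abs:src"));  // "http://example.com/pics/a.jpg"  -- resolved to an absolute URL
    }
}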
package just4test2;

import java.util.*;

public class Main {
    public static void main(String[] args) {
        try {
            Scanner cin = new Scanner(System.in);
            System.out.println("Enter the URL of the page to scrape");
            String base = cin.next();
            System.out.println("Enter the directory to save into");
            String path = cin.next();
            cin.close();
            Downloader down = new Downloader(base, path);
            down.getPic();
            System.out.println("Download finished");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("GG");
        }
    }
}

Stability is much better than before, but it is still far from polished: I could set a request timeout, add multithreading (that one will probably take a long time to get working), skip a file when an error occurs, and so on. That is about the limit of my current ability, so next I plan to brush up on some front-end knowledge and get a more thorough understanding of jsoup.
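For two of those ideas, the timeout and skipping on error, jsoup's Connection already has a timeout(millis) setter, and the skip is just a try/catch inside the loop. A rough sketch of what getPic() could look like with both changes, reusing the fields and imports of the Downloader class above (the 5000 ms value is an arbitrary choice of mine, not something decided in this post):

public void getPic() throws Exception {
    Document doc = Jsoup.connect(base)
            .timeout(5000)                     // give up on the request after 5 seconds (value picked arbitrarily)
            .get();
    int j = 1;
    for (Element x : doc.getElementsByTag("img")) {
        String imgSrc = x.attr("abs:src").replaceAll("\\s", "");
        try {
            System.out.println("Downloading file #" + j++ + ", URL: " + imgSrc);
            download(imgSrc);
        } catch (Exception e) {
            // skip this image instead of aborting the whole run
            System.out.println("Failed, skipping: " + imgSrc);
        }
    }
}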
