This version uses jsoup, a third-party library that makes parsing HTML remarkably easy, and a byte buffer for the file input/output to improve efficiency.
Downloader class
package just4test2;

import java.io.*;
import java.net.*;

import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;

public class Downloader {
    private String base, path;

    public Downloader(String abase, String apath) {
        base = abase;
        path = apath;
    }

    public void download(String src) throws Exception {
        File fp = new File(path);
        if (!fp.exists())
            fp.mkdirs();                                    // create the target directory if it does not exist
        int pos = src.lastIndexOf('/');
        String filename = src.substring(pos);               // keeps the leading '/', so path + filename is still a valid path
        URL url = new URL(src);
        InputStream cin = url.openStream();                 // open the connection and get an input stream
        FileOutputStream cout = new FileOutputStream(path + filename); // output stream for the local file
        int size = 0;
        byte[] buffer = new byte[1024];                     // 1 KB buffer: read a chunk, then write it to the file in one go for efficiency
        while ((size = cin.read(buffer)) != -1)
            cout.write(buffer, 0, size);                    // write exactly the bytes just read, starting at offset 0
        cin.close();
        cout.close();
    }

    public void getPic() throws Exception {
        Connection con = Jsoup.connect(base);               // open the page with jsoup
        Document doc = con.get();                           // fetch and parse it into a Document
        // find all img tags
        Elements imgs = doc.getElementsByTag("img");        // grabs the elements by tag, giving an Elements collection -- very apt naming
        int j = 1;
        for (Element x : imgs) {
            String imgSrc = x.attr("abs:src");              // resolves the src to an absolute URL (the step I didn't fully understand at first)
            imgSrc = imgSrc.replaceAll("\\s", "");          // strip any whitespace
            System.out.printf("Downloading file #%d", j++);
            System.out.print(", URL: ");
            System.out.println(imgSrc);
            download(imgSrc);
        }
    }
}

Main class
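About the "abs:src" step I didn't understand: jsoup keeps the URL passed to Jsoup.connect() as the document's base URI, and attr("abs:src") resolves a relative src against it, so download() always receives a full address. A minimal throwaway sketch of the difference (the HTML fragment, the base URI and the class name are made up for the demo, not part of the crawler):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class AbsSrcDemo {
    public static void main(String[] args) {
        // Parse a fragment with an explicit base URI -- the same thing connect() sets for a fetched page.
        Document d = Jsoup.parse("<img src='/pics/a.jpg'>", "http://example.com/page/");
        Element img = d.getElementsByTag("img").first();
        System.out.println(img.attr("src"));      // "/pics/a.jpg"  -- exactly as written in the HTML
        System.out.println(img.attr("abs:src"));  // "http://example.com/pics/a.jpg"  -- resolved to an absolute URL
    }
}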
package just4test2;

import java.util.*;

public class Main {
    public static void main(String[] args) {
        try {
            Scanner cin = new Scanner(System.in);
            System.out.println("Enter the URL of the page to scrape");
            String base = cin.next();
            System.out.println("Enter the directory to save into");
            String path = cin.next();
            cin.close();
            Downloader down = new Downloader(base, path);
            down.getPic();
            System.out.println("Download finished");
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("GG");
        }
    }
}

Stability is much better than before, but it is still far from polished: I could set a request timeout, add multithreading (that one will probably take a long time to get working), skip a file when an error occurs, and so on. That is about the limit of my current ability, so next I plan to brush up on some front-end knowledge and get a more thorough understanding of jsoup.
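For two of those ideas, the timeout and skipping on error, jsoup's Connection already has a timeout(millis) setter, and the skip is just a try/catch inside the loop. A rough sketch of what getPic() could look like with both changes, reusing the fields and imports of the Downloader class above (the 5000 ms value is an arbitrary choice of mine, not something decided in this post):

public void getPic() throws Exception {
    Document doc = Jsoup.connect(base)
            .timeout(5000)                     // give up on the request after 5 seconds (value picked arbitrarily)
            .get();
    int j = 1;
    for (Element x : doc.getElementsByTag("img")) {
        String imgSrc = x.attr("abs:src").replaceAll("\\s", "");
        try {
            System.out.println("Downloading file #" + j++ + ", URL: " + imgSrc);
            download(imgSrc);
        } catch (Exception e) {
            // skip this image instead of aborting the whole run
            System.out.println("Failed, skipping: " + imgSrc);
        }
    }
}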
