部分成功的Java图片抓取

xiaoxiao2021-02-28 46

非常草草的实现，能够完美实现对ecnu主页图片的抓取【跑，其他的都会报错或有缺失。

究其原因，还是在对img标签的解析上碰到了非常大的困难，正则表达式也不顶用，下一步考虑使用jsoup进行处理。

同时，图片的读写也较慢，可能与我逐字节写入文件有关，可以考虑使用ImageIO包进行处理。

还有一点可以改进的是对异常的处理，这个等看了《Core Java》相关章节再说，不急。

初步先实现给定网址，爬取图片至指定路径。之后考虑使用邮件发送（似乎不实用）

最终目标是架在服务器上，前端接收用户要求，后端进行爬取处理。

************************************************************************************************************

下载器类

package just4test;

import java.net.*;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.regex.*;

/**
 * Fetches a web page, collects the URLs of its {@code <img>} tags and downloads
 * every image into a target directory.
 *
 * <p>Typical use is a single call to {@link #download()}, which runs
 * {@link #filter()} itself. Calling {@code filter()} manually beforehand queues
 * every image a second time.
 */
public class Downloader {

    /**
     * Matches one {@code <img>} tag and captures its {@code src} value in group 2.
     * The tag body is limited to {@code [^>]} so a match can never run into the
     * next tag, and the attribute may use single or double quotes.
     */
    private static final Pattern IMG_SRC = Pattern.compile(
            "<img\\b[^>]*?\\ssrc\\s*=\\s*([\"'])([^>]*?)\\1",
            Pattern.CASE_INSENSITIVE);

    /** Size of the buffer used when copying an image to disk. */
    private static final int BUFFER_SIZE = 8192;

    /** Pending image URLs (absolute), in page order. */
    private final Queue<String> col;
    /** Address of the page to scan. */
    private final String base;
    /** Directory the images are written to. */
    private final String path;

    /**
     * @param abase address of the page whose images should be fetched
     * @param apath directory the images are stored in (created on demand)
     */
    public Downloader(String abase, String apath) {
        base = abase;
        path = apath;
        // Queue is an interface; LinkedList serves as the FIFO implementation.
        col = new LinkedList<>();
    }

    /**
     * Downloads the page at the base address and returns its markup.
     * The response is decoded as UTF-8; every line is terminated with {@code '\n'}.
     *
     * @return the HTML source of the page
     * @throws IOException if the address is malformed or the page cannot be read
     */
    public String getHtml() throws IOException {
        URL url = new URL(base);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        StringBuilder html = new StringBuilder();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append('\n');
            }
        } finally {
            connection.disconnect();
        }
        return html.toString();
    }

    /**
     * Scans the page for {@code <img>} tags and appends the absolute URL of each
     * image to the queue. Sources without a dot are ignored (an image file name
     * is expected to carry an extension), as are sources that do not resolve to
     * an http/https address (for example {@code data:} URIs).
     *
     * @throws IOException if the page cannot be fetched
     */
    public void filter() throws IOException {
        String html = getHtml();
        URL pageUrl = new URL(base);
        Matcher matcher = IMG_SRC.matcher(html);
        while (matcher.find()) {
            // Attribute values are HTML-escaped; "&amp;" must become "&" in the URL.
            String src = matcher.group(2).trim().replace("&amp;", "&");
            if (src.indexOf('.') == -1) {
                continue;
            }
            String resolved = resolveImageUrl(pageUrl, src);
            if (resolved != null) {
                col.add(resolved);
            }
        }
    }

    /**
     * @return the live queue of image URLs collected so far
     */
    public Queue<String> getCol() {
        return col;
    }

    /**
     * Collects the image URLs of the page and saves every image into the target
     * directory, emptying the queue as it goes. A file that already exists under
     * the same name is overwritten.
     *
     * @throws IOException if the page or an image cannot be read, or a file
     *         cannot be written; the first failure stops the run
     */
    public void download() throws IOException {
        filter();
        File dir = new File(path);
        if (!dir.isDirectory() && !dir.mkdirs()) {
            throw new FileNotFoundException("Cannot create target directory: " + dir);
        }
        int index = 0;
        while (!col.isEmpty()) {
            URL url = new URL(col.remove());
            File target = new File(dir, fileNameFor(url, index++));
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            // The input stream is opened first so a failed request leaves no empty file.
            try (InputStream in = connection.getInputStream();
                 OutputStream out = new FileOutputStream(target)) {
                byte[] buffer = new byte[BUFFER_SIZE];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            } finally {
                connection.disconnect();
            }
        }
    }

    /**
     * Resolves an image source against the page address, covering absolute,
     * protocol-relative ("//host/x"), root-relative ("/x") and relative ("x") forms.
     *
     * @return the absolute URL, or {@code null} if the source is not a usable
     *         http/https address
     */
    private static String resolveImageUrl(URL pageUrl, String src) {
        try {
            URL resolved = new URL(pageUrl, src);
            String protocol = resolved.getProtocol();
            if ("http".equalsIgnoreCase(protocol) || "https".equalsIgnoreCase(protocol)) {
                return resolved.toExternalForm();
            }
            return null;
        } catch (MalformedURLException ignored) {
            // Unknown schemes such as "data:" or "javascript:" cannot be downloaded.
            return null;
        }
    }

    /**
     * Derives a local file name from the path part of the URL, so that a query
     * string never ends up in the name. Characters that are illegal in file
     * names are replaced, and a generated name is used when the URL has none.
     */
    private static String fileNameFor(URL url, int index) {
        String urlPath = url.getPath();
        String name = urlPath.substring(urlPath.lastIndexOf('/') + 1)
                .replaceAll("[\\\\/:*?\"<>|]", "_");
        if (name.isEmpty() || name.equals(".") || name.equals("..")) {
            name = "image_" + index;
        }
        return name;
    }
}

测试类

package just4test; import java.util.*; import java.io.*; public class Main { public static void main(String[] args) { try { Scanner cin = new Scanner (System.in); System.out.println("输入要抓取的网页"); String base = cin.next(); System.out.println("输入存放路径"); String path = cin.next(); cin.close(); Downloader down = new Downloader(base,path); /*System.out.println(down.getHtml()); down.filter(); Queue<String> que = down.getCol(); for (String z:que) System.out.println(z);*/ down.download(); System.out.println("下载成功！"); } catch (FileNotFoundException e) {//关于异常的代码写得真的很偷懒，下次补上 e.printStackTrace(); System.out.println("小错误，不慌"); } catch(Exception e) { e.printStackTrace(); System.out.println("下载失败"); } } }

以上

转载请注明原文地址: https://www.6miu.com/read-2622343.html

技术

最新回复(0)