在Lucene搜索引擎中,必须在得到文件的InputStream流对象的同时解析文件流中的信息。可以使用的几种组件:NekoHTML解析和HTMLParser解析。所以分别使用两个组件做解析并比较结果。
下面是NekoHTML的解析测试类:
package com.unutrip.remoting.ws;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;

import org.apache.html.dom.HTMLDocumentImpl;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/** * 使用nekohtml解析HTML文件 * * @author longgangbai * */public class HTMLParser {
/** * 从html中抽取纯文本 * * @param content * @return * @throws UnsupportedEncodingException */ public static String extractTextFromHTML(String content) throws UnsupportedEncodingException { DOMFragmentParser parser = new DOMFragmentParser(); DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); InputStream is = new ByteArrayInputStream(content.getBytes()); try { parser.parse(new InputSource(is), node); } catch (IOException e) { e.printStackTrace(); } catch (SAXException se) { se.printStackTrace(); }
StringBuffer newContent = new StringBuffer(); getText(newContent, node);
String str = (new String(newContent.toString().getBytes("ISO-8859-1"), "UTF-8")); return str; }
private static void getText(StringBuffer sb, Node node) { if (node.getNodeType() == Node.TEXT_NODE) { sb.append(node.getNodeValue()); } if (node.getNodeType() == Node.ELEMENT_NODE) { Element elmt = (Element) node; // 抛弃脚本 if ((elmt.getTagName().equals("STYLE") || elmt.getTagName().equals( "SCRIPT"))) { sb.append(""); } }
NodeList children = node.getChildNodes(); if (children != null) { int len = children.getLength(); for (int i = 0; i < len; i++) { getText(sb, children.item(i)); } } }
public static String getHtmlContext(String htmlPath) throws Exception { BufferedReader br = new BufferedReader(new FileReader( new File(htmlPath))); StringBuilder sb = new StringBuilder(); String tmp = null;
while ((tmp = br.readLine()) != null) { sb.append(tmp); } String context = extractTextFromHTML(sb.toString()); System.out.println("context" + context); return context; }
public static void main(String[] args) { try { getHtmlContext("D://fy_choice.html"); } catch (Exception e) { e.printStackTrace(); } }}
解析效果不是很好,同时需要xerces.jar支持,部分HTML信息解析后带有乱码,不可识别,不太理想。
相关资源:Java 面经手册·小傅哥(公众号:bugstack虫洞栈).pdf