整个互联网可以看成是一张蜘蛛网,网页之间相互关联,可以根据某一个线索找到其他的分支。搜索引擎主要包括以下几个步骤:
1. 爬虫抓取互联网的网页。
2. 对网页内容进行分析。
3. 对分析后的内容建立索引。
4. 对网页检索结果进行排序。
5. 提供接口来交互。
1网页html源码的获取(WebHttpClient) 写道

package com.lucene.downpage;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;

/**
 * Fetches the home page of www.bnu.edu.cn over a raw socket and prints the
 * complete HTTP response (status line, headers and body) to standard output.
 */
public class WebHttpClient {

    /**
     * Sends a hand-written HTTP GET request and dumps the response.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        Socket webClient = null;
        try {
            webClient = new Socket("www.bnu.edu.cn", 80);
            PrintWriter request = new PrintWriter(webClient.getOutputStream());
            BufferedReader response = new BufferedReader(
                    new InputStreamReader(webClient.getInputStream()));

            // Note the format of the request line: the spaces must be exactly
            // right, otherwise the page cannot be found.
            // HTTP requires CRLF line endings, so they are written explicitly
            // instead of relying on println()'s platform line separator.
            request.print("GET / HTTP/1.1\r\n");
            request.print("Host:bnu.edu.cn\r\n");
            request.print("Connection:Close\r\n");
            request.print("\r\n");
            request.flush();

            // read() blocks until data arrives and returns -1 at end of stream,
            // so no polling on ready() is needed and -1 is never appended.
            StringBuilder sb = new StringBuilder(8096);
            int ch;
            while ((ch = response.read()) != -1) {
                sb.append((char) ch);
            }
            System.out.println(sb.toString());
        } catch (UnknownHostException e) {
            e.printStackTrace();
        } catch (IOException e) {
            System.err.println("下载失败。。。请检查输入的地址是否正确");
            System.exit(1);
        } finally {
            if (webClient != null) {
                try {
                    webClient.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }
    }
}

2保存到本地为HTML文件 写道

package com.lucene.downpage;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.Socket;
import java.net.UnknownHostException;

/**
 * Downloads the home page of www.bnu.edu.cn, drops everything before the
 * first '&lt;' (the HTTP status line and headers) and saves the rest to a
 * local HTML file.
 */
public class WebCrawler {

    /** Destination of the downloaded HTML source. */
    private static final String Text_File_path = "D:\\workshop\\ch2\\htmlsrc.html";

    /**
     * Downloads the page, echoes it to standard output and writes it to
     * {@link #Text_File_path}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Download first so that a failed download does not leave an
            // empty, truncated output file behind.
            String html = download();
            System.out.println(html);
            save(html);
        } catch (UnknownHostException e) {
            System.err.println("无法访问主机");
            System.exit(1);
        } catch (IOException e) {
            System.err.println("下载失败。。。请检查输入的地址是否正确");
            System.exit(1);
        }
    }

    /** Requests "/" from the server and returns the response from the first '<' onwards. */
    private static String download() throws IOException {
        /* Create the download connection. */
        Socket webClient = new Socket("www.bnu.edu.cn", 80);
        try {
            PrintWriter request = new PrintWriter(webClient.getOutputStream());
            BufferedReader response = new BufferedReader(
                    new InputStreamReader(webClient.getInputStream()));

            /* Send the HTTP request. */
            // Note the format of the request line: the spaces must be exactly
            // right, otherwise the page cannot be found. CRLF is written
            // explicitly because HTTP requires it.
            request.print("GET / HTTP/1.1\r\n");
            request.print("Host:bnu.edu.cn\r\n");
            request.print("Connection:Close\r\n");
            request.print("\r\n");
            request.flush();

            // Skip everything up to the first '<'.
            int ch = response.read();
            while (ch != -1 && ch != '<') {
                ch = response.read();
            }

            // Keep everything from that '<' to the end of the stream.
            StringBuilder sb = new StringBuilder(8096);
            while (ch != -1) {
                sb.append((char) ch);
                ch = response.read();
            }
            return sb.toString();
        } finally {
            webClient.close();
        }
    }

    /** Writes the given text to {@link #Text_File_path}, replacing any previous content. */
    private static void save(String html) throws IOException {
        FileWriter fp = new FileWriter(new File(Text_File_path));
        try {
            fp.write(html);
        } finally {
            fp.close();
        }
    }
}

分析本地网页内容,提取文本 写道

package com.lucene.downpage;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Reads the saved HTML file, removes tags, whitespace and {@code &nbsp;}
 * entities, and writes the remaining text to a plain-text file.
 */
public class WebParser {

    private static final String src_File_Path = "D:\\workshop\\ch2\\htmlsrc.html";
    private static final String dst_File_Path = "D:\\workshop\\ch2\\puresrc.txt";

    /** The only HTML entity this parser recognises. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /**
     * @param args unused
     * @throws IOException if reading or writing fails
     */
    public static void main(String[] args) throws IOException {
        Parser();
    }

    /**
     * Extracts the text of {@link #src_File_Path}, prints it and stores it in
     * {@link #dst_File_Path}. A missing file is reported with a stack trace
     * rather than propagated.
     *
     * @throws IOException if reading or writing fails for any other reason
     */
    public static void Parser() throws IOException {
        try {
            String text = extractText(readAll(new File(src_File_Path)));
            System.out.println(text);
            FileWriter fpWriter = new FileWriter(new File(dst_File_Path));
            try {
                fpWriter.write(text);
            } finally {
                fpWriter.close();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Reads the whole file; a single read() call may return only part of it. */
    private static String readAll(File file) throws IOException {
        FileReader reader = new FileReader(file);
        try {
            StringBuilder content = new StringBuilder(8096 * 2);
            char[] chunk = new char[8096];
            int n;
            while ((n = reader.read(chunk)) != -1) {
                content.append(chunk, 0, n);
            }
            return content.toString();
        } finally {
            reader.close();
        }
    }

    /** Returns the characters of {@code html} that lie outside tags, minus whitespace and {@code &nbsp;}. */
    private static String extractText(String html) {
        StringBuilder text = new StringBuilder(html.length());
        boolean inTag = false;
        for (int i = 0; i < html.length(); i++) {
            char c = html.charAt(i);
            if (inTag) {
                if (c == '>') {
                    inTag = false;
                }
                continue;
            }
            if (c == '<') {
                inTag = true;
                continue;
            }
            // NOTE(review): the published listing tested ' ' three times; the
            // duplicates were presumably '\t' and '\r' lost in formatting -
            // confirm against the author's original source.
            if (c == '\n' || c == '\r' || c == '\t' || c == ' ') {
                continue;
            }
            // startsWith(prefix, offset) never reads past the end of the text.
            if (html.startsWith(NBSP_ENTITY, i)) {
                i += NBSP_ENTITY.length() - 1;
                continue;
            }
            text.append(c);
        }
        return text.toString();
    }
}

对提取的文本进行分词过滤 写道

package com.lucene.downpage;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;

/**
 * Extracts the text of the saved HTML file and splits it into
 * space-separated tokens: runs of ASCII letters/digits stay together, every
 * other character becomes a token of its own, and a few punctuation marks
 * are replaced by a space.
 */
public class WebParseFilter {

    private static final String src_File_Path = "D:\\workshop\\ch2\\htmlsrc.html";
    private static final String dst_File_Path = "D:\\workshop\\ch2\\puresrc.txt";

    /** The only HTML entity this filter recognises. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /**
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            ParseFilter();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tokenizes the text of {@link #src_File_Path}, prints the result and
     * stores it in {@link #dst_File_Path}. A missing file is reported with a
     * stack trace rather than propagated.
     *
     * @throws IOException if reading or writing fails for any other reason
     */
    public static void ParseFilter() throws IOException {
        try {
            String tokens = tokenize(extractText(readAll(new File(src_File_Path))));
            System.out.println(tokens);
            FileWriter fw = new FileWriter(new File(dst_File_Path));
            try {
                fw.write(tokens);
            } finally {
                fw.close();
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
    }

    /** Reads the whole file; a single read() call may return only part of it. */
    private static String readAll(File file) throws IOException {
        FileReader reader = new FileReader(file);
        try {
            StringBuilder content = new StringBuilder(8096 * 2);
            char[] chunk = new char[8096];
            int n;
            while ((n = reader.read(chunk)) != -1) {
                content.append(chunk, 0, n);
            }
            return content.toString();
        } finally {
            reader.close();
        }
    }

    /** Returns the characters of {@code html} outside tags, minus newlines, spaces and {@code &nbsp;}. */
    private static String extractText(String html) {
        StringBuilder text = new StringBuilder(html.length());
        boolean inTag = false;
        for (int i = 0; i < html.length(); i++) {
            char c = html.charAt(i);
            if (inTag) {
                if (c == '>') {
                    inTag = false;
                }
                continue;
            }
            if (c == '<') {
                inTag = true;
                continue;
            }
            if (c == '\n' || c == ' ') {
                continue;
            }
            // All six characters must match. The previous test joined the
            // comparisons with ||, so it skipped six characters on almost any
            // input and could index past the end of the buffer.
            if (html.startsWith(NBSP_ENTITY, i)) {
                i += NBSP_ENTITY.length() - 1;
                continue;
            }
            text.append(c);
        }
        return text.toString();
    }

    /** Inserts spaces so that ASCII alphanumeric runs and single other characters become tokens. */
    private static String tokenize(String text) {
        StringBuilder out = new StringBuilder(text.length() * 2);
        boolean lastWasWordChar = true;
        for (int m = 0; m < text.length(); m++) {
            char c = text.charAt(m);
            if (isAsciiAlphanumeric(c)) {
                if (!lastWasWordChar) {
                    out.append(' ');
                }
                out.append(c);
                lastWasWordChar = true;
            } else if (c == '.' || c == '|' || c == '"' || c == ':' || c == ';') {
                // These punctuation marks only separate tokens.
                out.append(' ');
            } else {
                if (lastWasWordChar) {
                    out.append(' ');
                }
                out.append(c);
                out.append(' ');
                lastWasWordChar = false;
            }
        }
        return out.toString();
    }

    /** True for A-Z, a-z and 0-9 (the lower-case range was previously 'a'..'Z', which is empty). */
    private static boolean isAsciiAlphanumeric(char c) {
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9');
    }
}

对分词后的文本建立索引 写道

package com.lucene.downpage;

/**
 * One entry of a singly linked list of index hits: a file identifier and an
 * offset, plus a link to the next entry.
 */
public class InfoItem {

    /** Identifier of the file this entry refers to. */
    public int fileId;

    /** Offset recorded for this entry. */
    public int offset;

    /** Next entry in the list, or {@code null} if this is the last one. */
    InfoItem next;

    /** Creates an entry with file id 0, offset 0 and no successor. */
    public InfoItem() {
        fileId = 0;
        offset = 0;
        next = null;
    }

    /**
     * Creates an entry without a successor.
     *
     * @param id  file identifier
     * @param pos offset
     */
    public InfoItem(int id, int pos) {
        fileId = id;
        offset = pos;
        next = null;
    }

    /** @return the next entry, or {@code null} */
    public InfoItem getNext() {
        return next;
    }

    /** @param next the entry to link after this one */
    public void setNext(InfoItem next) {
        this.next = next;
    }

    /** @return the file identifier */
    public int getFileId() {
        return fileId;
    }

    /** @return the offset */
    public int getOffset() {
        return offset;
    }
}
建立索引并查询 写道

package com.lucene.downpage;

import java.util.Hashtable;

/**
 * A tiny in-memory inverted index: every single character of every entry in
 * {@link #FileList} is mapped to a linked list of {@link InfoItem} hits
 * (entry number and character offset).
 */
public class WordIndex {

    /** Maps a one-character keyword to the head of its hit list; {@code null} until {@link #index()} runs. */
    static Hashtable<String, InfoItem> keyWordIdx;

    /** The documents being indexed. */
    static String[] FileList = {"北 京 师 范 大 学", "北 师 大 附 属 实 验 小 学", "北 师 大 第 二 附 属 中 学" };

    /**
     * Builds the index and runs a sample query.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            index();
            search("北");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Rebuilds {@link #keyWordIdx} from {@link #FileList}, printing progress
     * to standard output.
     *
     * @throws Exception kept in the signature for existing callers
     */
    public static void index() throws Exception {
        System.out.println("index:==========begin=================");
        keyWordIdx = new Hashtable<String, InfoItem>();
        System.out.println("index:Hash Table initial Size:" + keyWordIdx.size());
        // Iterate over the real array length instead of a hard-coded 3.
        for (int i = 0; i < FileList.length; i++) {
            int len = FileList[i].length();
            for (int j = 0; j < len; j++) {
                String key = FileList[i].substring(j, j + 1);
                System.out.print(key);
                // Prepend the new hit; get() returns null for an unseen key,
                // which leaves the new item as the only element of its list.
                InfoItem item = new InfoItem(i, j);
                item.setNext(keyWordIdx.get(key));
                keyWordIdx.put(key, item);
            }
            System.out.println("");
        }
        System.out.println("index:Hash Table finish Size:" + keyWordIdx.size());
        System.out.println("====================end==================");
    }

    /**
     * Prints every hit recorded for {@code keyWord}. Does nothing beyond the
     * banner line if {@link #index()} has not been called yet.
     *
     * @param keyWord the one-character keyword to look up
     * @throws Exception kept in the signature for existing callers
     */
    public static void search(String keyWord) throws Exception {
        System.out.println("search:=============begin================");
        if (null == keyWordIdx) {
            return;
        }
        for (InfoItem item = keyWordIdx.get(keyWord); item != null; item = item.getNext()) {
            System.out.println("Search:File number:" + item.getFileId());
            System.out.println("Search:File offset:" + item.getOffset());
            // The content belongs to the file id. Indexing by offset printed
            // the wrong entry and threw for offsets >= FileList.length.
            System.out.println("Search:File Content:" + FileList[item.getFileId()]);
        }
        System.out.println("search:============end================");
    }
}
