nutch0.9中的摘要提取机制源码分析

xiaoxiao2026-05-28  3

**     * Low level api to get the most relevant (formatted) sections of the document.     * This method has been made public to allow visibility of score information held in TextFragment objects.     * Thanks to Jason Calabrese for help in redefining the interface.     * @param tokenStream     * @param text     * @param maxNumFragments     * @param mergeContiguousFragments     * @throws IOException     */    public final TextFragment[] getBestTextFragments(        TokenStream tokenStream,        String text,        boolean mergeContiguousFragments,        int maxNumFragments)        throws IOException    ...{        ArrayList docFrags = new ArrayList();        StringBuffer newText=new StringBuffer();        TextFragment currentFrag =    new TextFragment(newText,newText.length(), docFrags.size());        fragmentScorer.startFragment(currentFrag);        docFrags.add(currentFrag);        FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);        try        ...{            org.apache.lucene.analysis.Token token;            String tokenText;            int startOffset;            int endOffset;            int lastEndOffset = 0;            textFragmenter.start(text);            TokenGroup tokenGroup=new TokenGroup();            token = tokenStream.next();            while ((token!= null)&&(token.startOffset()<maxDocBytesToAnalyze))            ...{                if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct(token)))                ...{                    //the current token is distinct from previous tokens -                    // markup the cached token group info                    startOffset = tokenGroup.matchStartOffset;                    endOffset = tokenGroup.matchEndOffset;                    tokenText = text.substring(startOffset, endOffset);                    String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);                    //store any whitespace etc from between this and last group                    if (startOffset > lastEndOffset)                        newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));                    newText.append(markedUpText);                    lastEndOffset=Math.max(endOffset, lastEndOffset);                    tokenGroup.clear();                    //check if current token marks the start of a new fragment                    if(textFragmenter.isNewFragment(token))                    ...{                        currentFrag.setScore(fragmentScorer.getFragmentScore());                        //record stats for a new fragment                        currentFrag.textEndPos = newText.length();                        currentFrag =new TextFragment(newText, newText.length(), docFrags.size());                        fragmentScorer.startFragment(currentFrag);                        docFrags.add(currentFrag);                    }                }                tokenGroup.addToken(token,fragmentScorer.getTokenScore(token));//                if(lastEndOffset>maxDocBytesToAnalyze)//                {//                    break;//                }                token = tokenStream.next();            }            currentFrag.setScore(fragmentScorer.getFragmentScore());            if(tokenGroup.numTokens>0)            ...{                //flush the accumulated text (same code as in above loop)                startOffset = tokenGroup.matchStartOffset;                endOffset = tokenGroup.matchEndOffset;                tokenText = text.substring(startOffset, endOffset);                String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);                //store any whitespace etc from between this and last group                if (startOffset > lastEndOffset)                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));                newText.append(markedUpText);                lastEndOffset=Math.max(lastEndOffset,endOffset);            }            //Test what remains of the original text beyond the point where we stopped analyzing             if (//                    if there is text beyond the last token considered..                    (lastEndOffset < text.length())                     &&//                    and that text is not too large...                    (text.length()<maxDocBytesToAnalyze)                )                            ...{                //append it to the last fragment                newText.append(encoder.encodeText(text.substring(lastEndOffset)));            }            currentFrag.textEndPos = newText.length();            //sort the most relevant sections of the text            for (Iterator i = docFrags.iterator(); i.hasNext();)            ...{                currentFrag = (TextFragment) i.next();                //If you are running with a version of Lucene before 11th Sept 03                // you do not have PriorityQueue.insert() - so uncomment the code below                /**//*                                    if (currentFrag.getScore() >= minScore)                                    {                                        fragQueue.put(currentFrag);                                        if (fragQueue.size() > maxNumFragments)                                        { // if hit queue overfull                                            fragQueue.pop(); // remove lowest in hit queue                                            minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore                                        }                                    }                */                //The above code caused a problem as a result of Christoph Goller's 11th Sept 03                //fix to PriorityQueue. The correct method to use here is the new "insert" method                // USE ABOVE CODE IF THIS DOES NOT COMPILE!                fragQueue.insert(currentFrag);            }            //return the most relevant fragments            TextFragment frag[] = new TextFragment[fragQueue.size()];            for (int i = frag.length - 1; i >= 0; i--)            ...{                frag[i] = (TextFragment) fragQueue.pop();            }            //merge any contiguous fragments to improve readability            if(mergeContiguousFragments)            ...{                mergeContiguousFragments(frag);                ArrayList fragTexts = new ArrayList();                for (int i = 0; i < frag.length; i++)                ...{                    if ((frag[i] != null&& (frag[i].getScore() > 0))                    ...{                        fragTexts.add(frag[i]);                    }                }                frag= (TextFragment[]) fragTexts.toArray(new TextFragment[0]);            }            return frag;        }        finally        ...{            if (tokenStream != null)            ...{                try                ...{                    tokenStream.close();                }                catch (Exception e)                ...{                }            }        }    }

相关资源:敏捷开发V1.0.pptx
转载请注明原文地址: https://www.6miu.com/read-5049564.html

最新回复(0)