Example usage for org.apache.lucene.analysis TokenStream close

List of usage examples for org.apache.lucene.analysis TokenStream close

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
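
The typical consuming sequence is reset(), incrementToken() in a loop, end(), and finally close(). Because TokenStream implements Closeable, try-with-resources can take care of the close() call. The snippet below is a minimal sketch under that assumption; the analyzer, field name, and text variables are hypothetical:

// Minimal sketch of the usual consume-and-close cycle (hypothetical analyzer, field, and text).
// try-with-resources calls close() automatically, releasing the resources held by the stream.
try (TokenStream stream = analyzer.tokenStream("body", new StringReader(text))) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();                       // required before the first incrementToken()
    while (stream.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    stream.end();                         // records the final token state before closing
}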

Usage

From source file:com.zimbra.cs.index.ZimbraAnalyzerTest.java

License:Open Source License

/**
 * We intentionally disable the positionIncrement because we want phrases to match across removed stop words.
 *
 * @see PositionIncrementAttribute
 */
@Test
public void positionIncrement() throws Exception {
    TokenStream stream = ZimbraAnalyzer.getInstance().tokenStream(LuceneFields.L_H_SUBJECT,
            new StringReader("It's a test."));
    PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Assert.assertEquals(posIncrAtt.getPositionIncrement(), 1);
    }
    stream.end();
    stream.close();
}

From source file:CopulaResources.TermCooccurence.java

private static List<String> tokenizeString(Analyzer analyzer, String str) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(str));
        stream.reset();
        while (stream.incrementToken())
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java

License:Open Source License

private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString);
    theTokenStream.reset();
    CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
    try {
        if (theTokenStream.incrementToken()) {
            return theCharTerms.toString();
        }
        return null;
    } finally {
        theTokenStream.end();
        theTokenStream.close();
    }
}

From source file:de.twitterlivesearch.analysis.Tokenizer.java

License:Apache License

/**
 * @param stringToAnalyze
 *            String to be tokenized
 * @param analyzer
 *            {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used for analysis
 *
 * @return list of tokens
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}

From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java

License:Apache License

public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}

From source file:doc2vec.LuceneDocIterator.java

String preProcess(Analyzer analyzer, String text) throws Exception {

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();

        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }

        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}

From source file:drakkar.mast.retrieval.analysis.NGramQuery.java

/**
 *
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    String words[] = null;

    //split the query into individual words on whitespace
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }

    //more than one term
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        //single term
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                Term t = new Term(field, termAtt.term());
                TermQuery pquery = new TermQuery(t);
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}
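
The constructor above uses the pre-4.0 TermAttribute API (termAtt.term()), which was removed in later Lucene releases. A minimal sketch of the same per-word analysis loop against the current CharTermAttribute API, reusing the variable names from the example above, might look like this:

// Hedged sketch: the analysis loop above, rewritten for CharTermAttribute
// (the replacement for TermAttribute in Lucene 4.x and later).
TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
tokens.reset();
while (tokens.incrementToken()) {
    Term t = new Term(field, termAtt.toString());
    add(new BooleanClause(new TermQuery(t), org.apache.lucene.search.BooleanClause.Occur.MUST));
}
tokens.end();
tokens.close();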

From source file:drakkar.mast.retrieval.ngram.NGramQuery.java

/**
 *
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String words[] = null;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[1];
        words[0] = queryTerm;
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            String string = words[i];
            Term t = new Term(field, string);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }

    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                Term t = new Term(field, termAtt.term());
                TermQuery pquery = new TermQuery(t);
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }

            tokens.end();
            tokens.close();
        }

    }
}

From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java

License:Open Source License

protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all tokens until the stream is exhausted
                while (stream.incrementToken()) {
                    term += (termAtt.toString());
                }
                //                System.out.println(term);
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFrec = reader.docFreq(new Term("contents", term));
                if (wikiTermFrec != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFrec);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }

            }

        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}

From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java

License:Apache License

/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                //the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                //check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            //            if(lastEndOffset>maxDocBytesToAnalyze)
            //            {
            //               break;
            //            }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            //flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        //Test what remains of the original text beyond the point where we stopped analyzing 
        if (
        //               if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        //               and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            //append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        //sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            //If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
                   if (currentFrag.getScore() >= minScore)
                   {
                      fragQueue.put(currentFrag);
                      if (fragQueue.size() > maxNumFragments)
                      { // if hit queue overfull
                         fragQueue.pop(); // remove lowest in hit queue
                         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                      }
                    
                    
                   }
            */
            //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            //fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        //return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        //merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }

        return frag;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
                // ignore exceptions thrown while closing the stream
            }
        }
    }
}