Example usage for org.apache.lucene.analysis TokenStream close

Introduction

This page shows example usage of org.apache.lucene.analysis.TokenStream.close().

Prototype

@Override
public void close() throws IOException 

Document

Releases resources associated with this stream.
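
For orientation before the examples below, here is a minimal sketch (not taken from any of the sources listed) of the canonical consume-and-close workflow. TokenStream implements Closeable, so try-with-resources releases the stream even if consumption fails; the analyzer, field name, and sample text are illustrative and assume a recent Lucene (4.x or later).

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources guarantees close() even if consumption throws
        try (TokenStream ts = analyzer.tokenStream("body", "Hello Lucene")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                     // required before incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();                       // finalize offset state before closing
        }                                   // close() releases resources
        analyzer.close();
    }
}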

Usage

From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java

License:Apache License

/** 
 * TODO: rewrite tests not to use string comparison.
 */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety: ensure no state is carried over between tokens by
    // assigning bogus values
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    while (in.incrementToken()) {
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
        out.append(' ');
    }
    if (out.length() > 0)
        out.deleteCharAt(out.length() - 1);

    in.close();
    return out.toString();
}

From source file:com.github.mosuka.apache.lucene.example.utils.LuceneExampleUtilTest.java

License:Apache License

public void testCreateAnalyzerWrapper() throws IOException {
    PerFieldAnalyzerWrapper wrapper = LuceneExampleUtil.createAnalyzerWrapper();

    TokenStream tokenStream = null;
    CharTermAttribute charTermAttribute = null;

    List<String> expectedIdTermList = new LinkedList<String>(Arrays.asList("1"));
    List<String> actualIdTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("id", "1");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualIdTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedIdTermList, actualIdTermList);

    List<String> expectedTextTermList = new LinkedList<String>(
            Arrays.asList("lucene", "is", "a", "full", "text", "search", "library"));
    List<String> actualTextTermList = new LinkedList<String>();
    tokenStream = wrapper.tokenStream("text", "Lucene is a Full-text search library.");
    charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        actualTextTermList.add(charTermAttribute.toString());
    }
    tokenStream.close();
    assertEquals(expectedTextTermList, actualTextTermList);
}

From source file:com.github.pmerienne.trident.ml.preprocessing.EnglishTokenizer.java

License:Apache License

public List<String> tokenize(String text) {
    List<String> words = new ArrayList<String>();
    if (text != null && !text.isEmpty()) {
        TokenStream tokenStream = this.createTokenStream(text);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        try {
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                words.add(term);
            }
        } catch (IOException ioe) {
            LOGGER.error("Unable to analyze text. Cause : " + ioe.getMessage(), ioe);
        } finally {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (IOException e) {
                // Nothing we can do here.
                LOGGER.error("Unable to close token stream : " + e.getMessage());
            }
        }
    }

    return words;
}

From source file:com.github.riccardove.easyjasub.lucene.LuceneParser.java

License:Apache License

private List<LuceneToken> readTokens(TokenStream tokenStream) throws IOException {
    ArrayList<LuceneToken> tokens = new ArrayList<LuceneToken>();
    HashMap<Integer, LuceneToken> tokensByStartOffset = new HashMap<Integer, LuceneToken>();
    addAttributes(tokenStream);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        if (tokenStream.hasAttributes()) {
            LuceneToken token = new LuceneToken();

            readOffset(tokenStream, token);

            // Lucene may output multiple tokens for compound words
            LuceneToken tokenWithSameStartOffset = tokensByStartOffset.get(token.getStartOffset());
            if (tokenWithSameStartOffset != null) {
                if (token.getEndOffset() >= tokenWithSameStartOffset.getEndOffset()) {
                    continue;
                } else {
                    tokens.remove(tokenWithSameStartOffset);
                }
            }

            readReading(tokenStream, token);
            readPartOfSpeech(tokenStream, token);
            readInflection(tokenStream, token);
            readBaseForm(tokenStream, token);

            tokensByStartOffset.put(token.getStartOffset(), token);
            tokens.add(token);
        }
    }

    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:com.github.tteofili.looseen.MinHashClassifier.java

License:Apache License

private ArrayList<String> getTokens(Analyzer analyzer, String field, String value) throws IOException {
    ArrayList<String> tokens = new ArrayList<String>();
    TokenStream ts = analyzer.tokenStream(field, value);
    ts.reset();
    while (ts.incrementToken()) {
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        String token = new String(termAttribute.buffer(), 0, termAttribute.length());
        tokens.add(token);
    }
    ts.end();
    ts.close();
    return tokens;
}

From source file:com.globalsight.ling.lucene.highlight.Highlighter.java

License:Apache License

/**
 * Low level api to get the most relevant (formatted) sections of
 * the document.
 *
 * This method has been made public to allow visibility of score
 * information held in TextFragment objects.  Thanks to Jason
 * Calabrese for help in redefining the interface.
 * @param tokenStream stream of tokens produced from the text
 * @param text the raw text being highlighted
 * @param mergeContiguousFragments whether adjacent fragments should be merged
 * @param maxNumFragments maximum number of fragments to return
 * @return the most relevant text fragments
 * @throws IOException
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments) throws IOException {
    ArrayList docFrags = new ArrayList();
    StringBuffer newText = new StringBuffer();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        org.apache.lucene.analysis.Token token;
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text);

        TokenGroup tokenGroup = new TokenGroup();

        while ((token = LuceneUtil.getNextToken(tokenStream)) != null) {
            if (tokenGroup.numTokens > 0 && tokenGroup.isDistinct(token)) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.startOffset;
                endOffset = tokenGroup.endOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(tokenText, tokenGroup);

                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(text.substring(lastEndOffset, startOffset));
                newText.append(markedUpText);
                lastEndOffset = endOffset;
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment(token)) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());

                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(token, fragmentScorer.getTokenScore(token));

            if (lastEndOffset > maxDocBytesToAnalyze) {
                break;
            }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.startOffset;
            endOffset = tokenGroup.endOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(tokenText, tokenGroup);

            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset) {
                newText.append(text.substring(lastEndOffset, startOffset));
            }
            newText.append(markedUpText);
            lastEndOffset = endOffset;
        }

        // append text after end of last token
        if (lastEndOffset < text.length()) {
            newText.append(text.substring(lastEndOffset));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (int i = 0, max = docFrags.size(); i < max; i++) {
            currentFrag = (TextFragment) docFrags.get(i);

            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment result[] = new TextFragment[fragQueue.size()];
        for (int i = result.length - 1; i >= 0; i--) {
            result[i] = (TextFragment) fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(result);

            ArrayList fragTexts = new ArrayList();
            for (int i = 0; i < result.length; i++) {
                if (result[i] != null && result[i].getScore() > 0) {
                    fragTexts.add(result[i]);
                }
            }

            result = (TextFragment[]) fragTexts.toArray(new TextFragment[0]);
        }

        return result;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.close();
            } catch (Exception e) {
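                // ignore failures while closing the stream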
            }
        }
    }
}
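
For context, a hypothetical sketch of how a caller might drive getBestTextFragments, assuming this Highlighter mirrors Lucene's standard contrib highlighter (a constructor taking a Formatter and a Scorer; SimpleHTMLFormatter and QueryScorer are stand-ins from that API, and the analyzer, query, and field name are assumptions):

// Hypothetical usage sketch; not taken from the source above.
static List<String> highlight(Analyzer analyzer, Query query, String text) throws IOException {
    TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
    // getBestTextFragments closes the TokenStream in its finally block,
    // so the caller does not need to call tokenStream.close() afterwards
    TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, true, 3);
    List<String> out = new ArrayList<String>();
    for (TextFragment frag : frags) {
        if (frag != null && frag.getScore() > 0) {
            out.add(frag.toString());
        }
    }
    return out;
}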

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.
 *
 * @param p_text fuzzy match format string
 * @return List of c.g.l.tm2.index.Tokens
 */
public static List<Token> createGsTokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));

    tokenStream.reset();
    //GSAttribute gsAtt = tokenStream.addAttribute(GSAttribute.class);
    //org.apache.lucene.analysis.Token luceneToken = null;
    List<String> tokens = new ArrayList<String>();

    while (tokenStream.incrementToken()) {
        // luceneToken = gsAtt.getToken();

        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());

    }
    tokenStream.close();
    return buildTokenList(tokens);
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

/**
 * Create GlobalSight TM tokens from a provided segment string
 * using GsAnalyzer.  This method is suitable for use with TM3
 * fuzzy indices, and does two things differently than createGsTokens():
 * 1) It returns tokens in the order in which they appear
 * 2) It does not collapse duplicate tokens (and correspondingly does
 *    not return count information)
 *
 * @param p_text fuzzy match format string
 * @return List of Strings, each representing one token
 */
public static List<String> createTm3Tokens(String p_text, GlobalSightLocale p_locale) throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}

From source file:com.globalsight.ling.tm2.lucene.LuceneUtil.java

License:Apache License

@SuppressWarnings("resource")
public static List<String> createTm3TokensNoStopWord(String p_text, GlobalSightLocale p_locale)
        throws Exception {
    GsAnalyzer analyzer = new GsAnalyzer(p_locale, false);
    TokenStream tokenStream = analyzer.tokenStream("blah", new StringReader(p_text));
    tokenStream.reset();

    List<String> tokens = new ArrayList<String>();
    while (tokenStream.incrementToken()) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokens.add(termAtt.toString());
    }
    tokenStream.close();

    return tokens;
}

From source file:com.jaeksoft.searchlib.request.SearchField.java

License:Open Source License

final private List<TermQueryItem> getTermQueryFilter(final PerFieldAnalyzer perFieldAnalyzer,
        CompiledAnalyzer compiledAnalyzer, final String queryString) throws IOException {
    TokenStream ts = null;
    TokenQueryFilter.TermQueryFilter tqf = null;
    Analyzer analyzer = compiledAnalyzer != null ? compiledAnalyzer : perFieldAnalyzer.getKeywordAnalyzer();
    try {
        ts = analyzer.tokenStream(field, new StringReader(queryString));
        tqf = new TermQueryFilter(compiledAnalyzer, field, (float) termBoost, ts);
        while (tqf.incrementToken())
            ;
        ts.end();
        ts.close();

        tqf.sortByOffset();

        TermQueryFilter.includeChildrenBrothers(tqf.termQueryItems);
        for (TermQueryItem termQueryItem : tqf.termQueryItems)
            termQueryItem.includeChildrenBrothers();
        return tqf.termQueryItems;
    } finally {
        IOUtils.close(tqf, ts, analyzer);
    }
}