List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
From source file:org.hibernate.search.util.AnalyzerUtils.java
License:Open Source License
/**
 * Analyzes {@code text} with the given analyzer and returns the produced term values in order.
 *
 * @param analyzer the Lucene analyzer used to tokenize the text
 * @param field the field name; may influence the analyzer's behavior
 * @param text the text to analyze
 * @return the term values produced by the analysis, in token order
 * @throws IOException if the token stream cannot be consumed
 */
public static List<String> tokenizedTermValues(Analyzer analyzer, String field, String text) throws IOException {
    List<String> tokenList = new ArrayList<String>();
    // try-with-resources guarantees the stream is released even if tokenization fails
    try (TokenStream stream = analyzer.tokenStream(field, new StringReader(text))) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // mandatory before incrementToken() since Lucene 4
        while (stream.incrementToken()) {
            tokenList.add(new String(term.buffer(), 0, term.length()));
        }
        stream.end(); // finalize stream state (e.g. final offset)
    }
    return tokenList;
}
From source file:org.hibernate.search.util.AnalyzerUtils.java
License:Open Source License
public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException { TokenStream stream = analyzer.tokenStream(field, new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); List<Token> tokenList = new ArrayList<Token>(); while (stream.incrementToken()) { Token token = new Token(); token.copyBuffer(term.buffer(), 0, term.length()); tokenList.add(token);//from w ww . jav a 2 s. c o m } return tokenList.toArray(new Token[tokenList.size()]); }
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if
 * there is more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis, or {@code null} if the
 *         analysis produced no token at all
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            // reset() is required before the first incrementToken() call
            stream.reset();
            if (stream.incrementToken()) {
                // Copy the term out immediately: the attribute buffer is reused
                // by the next incrementToken() call.
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    // More than one token: warn, but still return only the first one.
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            // NOTE(review): stream.end() is skipped when multiple tokens are found
            // or when the analysis yields no token at all — confirm this is intended.
            return firstToken;
        } finally {
            // close() releases the stream's resources regardless of outcome
            stream.close();
        }
    } catch (SearchException | IOException e) {
        // Wrap any analysis failure into the project's SearchException via the logger factory
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
/**
 * Tokenizes {@code searchString} with the given analyzer and builds a boolean
 * query in which every token is a required term, except the last one which is
 * turned into a prefix query (unless the search string ends with a space, in
 * which case all tokens are exact terms).
 *
 * @param searchString the raw user search string
 * @param field the indexed field to query
 * @param analyzer the analyzer used to tokenize the search string; owned by
 *        the caller and therefore NOT closed here
 * @return the assembled boolean query
 * @throws IOException if the token stream cannot be consumed
 */
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    List<String> terms = new ArrayList<>();
    // try-with-resources closes reader and stream even on failure.
    // Fix: the analyzer is no longer closed here — closing a caller-supplied
    // analyzer would break any subsequent use of it.
    try (StringReader textReader = new StringReader(searchString);
            TokenStream tokenStream = analyzer.tokenStream(field, textReader)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        // Last token is treated as an in-progress word: match it as a prefix.
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery(new Term(field, last)), Occur.MUST);
    }
    // Every remaining (complete) token must match exactly.
    terms.forEach(s -> bq.add(new TermQuery(new Term(field, s)), Occur.MUST));
    return bq;
}
From source file:org.index.Tag.java
/**
 * Strips markup from {@code text} and returns its analyzed terms joined by
 * single spaces (with a trailing space when at least one term is produced).
 *
 * @param text the raw question text, possibly containing tags
 * @return the space-separated bag of words
 * @throws Exception if tag removal or analysis fails
 */
private String getBagOfWords(String text) throws Exception {
    // StringBuilder: local-only buffer, no need for StringBuffer's synchronization
    StringBuilder buff = new StringBuilder();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    // try-with-resources ensures the stream is closed even if analysis fails
    try (TokenStream stream = analyzer.tokenStream("bow", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            buff.append(termAtt.toString()).append(" ");
        }
        stream.end();
    }
    return buff.toString();
}
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception { List<String> terms = new ArrayList<>(); text = Question.removeTags(text);//from w ww.j a v a2 s . c o m boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true")); String stopFile = prop.getProperty("stopfile"); Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); /*SOAnalyzer(toStem, stopFile)*/; TokenStream stream = analyzer.tokenStream("bow", new StringReader(text)); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); stream.reset(); while (stream.incrementToken()) { String term = termAtt.toString(); terms.add(term); } stream.end(); stream.close(); return terms; }
From source file:org.languagetool.dev.index.LanguageToolFilterTest.java
License:Open Source License
/**
 * Debug helper: prints every token of the (already reset) stream to stdout,
 * grouped by position, in the form {@code [term:start->end:type]}.
 *
 * @param stream the token stream to drain; consumed but not closed here
 * @throws IOException if the stream cannot be consumed
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute incrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    int pos = 0;
    while (stream.incrementToken()) {
        int delta = incrAtt.getPositionIncrement();
        if (delta > 0) {
            // A positive increment starts a new position: begin a new output line.
            pos += delta;
            System.out.println();
            System.out.print(pos + ": ");
        }
        String details = "[" + termAtt.toString() + ":" + offsetAtt.startOffset() + "->"
                + offsetAtt.endOffset() + ":" + typeAtt.type() + "] ";
        System.out.print(details);
    }
    System.out.println();
}
From source file:org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java
License:Open Source License
public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException { List<String> result = new ArrayList<String>(); StringReader reader = new StringReader(keywords); TokenStream stream = analyzer.tokenStream(field, reader); CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); try {//from w ww.j a v a 2s . c om stream.reset(); while (stream.incrementToken()) { result.add(termAtt.toString()); } stream.close(); } finally { stream.close(); } return result; }
From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java
License:Open Source License
public static List<String> readTokenStream(TokenStream tok) throws IOException { List<String> terms = new ArrayList<String>(); CharTermAttribute termAtt = tok.addAttribute(CharTermAttribute.class); try {//from ww w . ja va2 s . c o m tok.reset(); while (tok.incrementToken()) { terms.add(termAtt.toString()); } tok.end(); } finally { tok.close(); } return terms; }
From source file:org.modeshape.jcr.query.lucene.LuceneQueryFactory.java
License:Open Source License
public Query createQuery(final SelectorName selectorName, String fieldName, FullTextSearch.Term term) throws IOException { assert fieldName != null; if (term instanceof FullTextSearch.Conjunction) { FullTextSearch.Conjunction conjunction = (FullTextSearch.Conjunction) term; BooleanQuery query = new BooleanQuery(); for (FullTextSearch.Term nested : conjunction) { if (nested instanceof NegationTerm) { Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm()); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST_NOT); }//from www.j a v a 2 s .c om } else { Query subQuery = createQuery(selectorName, fieldName, nested); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST); } } } return query; } if (term instanceof FullTextSearch.Disjunction) { FullTextSearch.Disjunction disjunction = (FullTextSearch.Disjunction) term; BooleanQuery query = new BooleanQuery(); for (FullTextSearch.Term nested : disjunction) { if (nested instanceof NegationTerm) { Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm()); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.MUST_NOT); } } else { Query subQuery = createQuery(selectorName, fieldName, nested); if (!EMPTY_PHRASE_QUERY.equals(subQuery)) { query.add(subQuery, Occur.SHOULD); } } } return query; } if (term instanceof FullTextSearch.SimpleTerm) { FullTextSearch.SimpleTerm simple = (FullTextSearch.SimpleTerm) term; Analyzer analyzer = getFullTextSearchAnalyzer(); if (simple.containsWildcards()) { // Use the ComplexPhraseQueryParser, but instead of wildcard queries (which don't work with leading // wildcards) we should use our like queries (which often use RegexQuery where applicable) ... 
QueryParser parser = new QueryParser(version, fieldName, analyzer) { @Override protected org.apache.lucene.search.Query getWildcardQuery(String field, String termStr) { return findNodesLike(selectorName, termStr.toLowerCase(), field, CaseOperations.LOWERCASE); } }; parser.setAllowLeadingWildcard(true); try { String expression = simple.getValue(); // The ComplexPhraseQueryParser only understands the '?' and '*' as being wildcards ... expression = expression.replaceAll("(?<![\\\\])_", "?"); expression = expression.replaceAll("(?<![\\\\])%", "*"); // // Replace any '-' between tokens, except when preceded or followed by a digit, '*', or '?' ... expression = expression.replaceAll("((?<![\\d*?]))[-]((?![\\d*?]))", "$1 $2"); // Then use the parser ... return parser.parse(expression); } catch (ParseException e) { throw new IOException(e); } } PhraseQuery query = new PhraseQuery(); query.setSlop(0); // terms must be adjacent String expression = simple.getValue(); // Run the expression through the Lucene analyzer to extract the terms ... TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(expression)); CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { // The term attribute object has been modified to contain the next term ... String analyzedTerm = termAttribute.toString(); query.add(new Term(fieldName, analyzedTerm)); } return query; } // Should not get here ... assert false; return null; }