Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class&lt;? extends Attribute&gt; token identifying the attribute instance to add or retrieve from the stream.

Usage

From source file:org.hibernate.search.util.AnalyzerUtils.java

License:Open Source License

/**
 * Analyzes the given text and returns the resulting term values as strings.
 *
 * @param analyzer the Lucene analyzer to apply
 * @param field the field name (may influence the analyzer's behavior)
 * @param text the text to tokenize
 * @return the term values produced by the analysis, in order
 * @throws IOException if the token stream cannot be read
 */
public static List<String> tokenizedTermValues(Analyzer analyzer, String field, String text)
        throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    List<String> tokenList = new ArrayList<String>();
    try {
        // reset() is mandatory before the first incrementToken() call
        stream.reset();
        while (stream.incrementToken()) {
            tokenList.add(new String(term.buffer(), 0, term.length()));
        }
        stream.end();
    } finally {
        // always release the stream so the analyzer can be reused
        stream.close();
    }
    return tokenList;
}

From source file:org.hibernate.search.util.AnalyzerUtils.java

License:Open Source License

/**
 * Analyzes the given text and returns the resulting tokens.
 *
 * @param analyzer the Lucene analyzer to apply
 * @param field the field name (may influence the analyzer's behavior)
 * @param text the text to tokenize
 * @return the tokens produced by the analysis, in order
 * @throws IOException if the token stream cannot be read
 */
public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    List<Token> tokenList = new ArrayList<Token>();
    try {
        // reset() is mandatory before the first incrementToken() call
        stream.reset();
        while (stream.incrementToken()) {
            Token token = new Token();
            token.copyBuffer(term.buffer(), 0, term.length());
            tokenList.add(token);
        }
        stream.end();
    } finally {
        // always release the stream so the analyzer can be reused
        stream.close();
    }
    return tokenList.toArray(new Token[tokenList.size()]);
}

From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if there are more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis, or {@code null} if the analysis produced no token
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    // more than one token: only the first is used for sorting
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                }
            }
            // end() is part of the TokenStream contract and was previously
            // skipped whenever more than one token was produced
            stream.end();
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}

From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java

License:Apache License

/**
 * Builds a boolean query in which the final token of the search string is
 * matched as a prefix (unless the input ends with a space) and every
 * preceding token is matched as an exact term.
 *
 * @param searchString the raw user search input
 * @param field the index field to query against
 * @param analyzer the analyzer used to tokenize the search string; note that
 *        this method closes it, so it cannot be reused afterwards
 * @return the assembled boolean query
 * @throws IOException if tokenization fails
 */
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    StringReader textReader = new StringReader(searchString);
    List<String> terms = new ArrayList<>();
    // try-with-resources guarantees the stream is closed even if tokenization throws
    try (TokenStream tokenStream = analyzer.tokenStream(field, textReader)) {
        tokenStream.reset();
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } finally {
        textReader.close();
        // NOTE(review): closing the caller-supplied analyzer prevents its reuse -- confirm ownership
        analyzer.close();
    }

    BooleanQuery bq = new BooleanQuery();
    if (!terms.isEmpty() && !searchString.endsWith(" ")) {
        // a trailing space means the last word is complete; otherwise treat it as a prefix
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery(new Term(field, last)), Occur.MUST);
    }
    terms.stream().forEach((s) -> {
        bq.add(new TermQuery(new Term(field, s)), Occur.MUST);
    });

    return bq;
}

From source file:org.index.Tag.java

/**
 * Analyzes the given text and returns its tokens concatenated into a single
 * space-separated string (a "bag of words").
 *
 * @param text the raw text; HTML tags are stripped before analysis
 * @return the space-separated analyzed tokens (with a trailing space when non-empty)
 * @throws Exception if analysis fails
 */
private String getBagOfWords(String text) throws Exception {

    // StringBuilder: no synchronization needed for this local accumulator
    StringBuilder buff = new StringBuilder();
    text = Question.removeTags(text);

    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    // try-with-resources closes the stream even if incrementToken() throws
    try (TokenStream stream = analyzer.tokenStream("bow", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            buff.append(termAtt.toString()).append(" ");
        }

        stream.end();
    }

    return buff.toString();
}

From source file:org.index.TermScore.java

/**
 * Analyzes the given text with a whitespace analyzer and returns the tokens.
 *
 * @param text the raw text; HTML tags are stripped before analysis
 * @return the analyzed tokens, in order
 * @throws Exception if analysis fails
 */
private List<String> getBagOfWords(String text) throws Exception {

    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);

    // NOTE(review): the "stem"/"stopfile" properties and SOAnalyzer were read here but
    // never used -- a plain whitespace analyzer tokenizes the text; dead locals removed.
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9);
    // try-with-resources closes the stream even if incrementToken() throws
    try (TokenStream stream = analyzer.tokenStream("bow", new StringReader(text))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            terms.add(termAtt.toString());
        }

        stream.end();
    }

    return terms;
}

From source file:org.languagetool.dev.index.LanguageToolFilterTest.java

License:Open Source License

/**
 * Prints every token of the given stream with its position, character offsets
 * and type, grouping tokens that share a position onto one output line.
 *
 * @param stream the token stream to walk; assumed already reset by the caller -- TODO confirm
 * @throws IOException if the stream cannot be advanced
 */
private static void displayTokensWithFullDetails(TokenStream stream) throws IOException {
    final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    final TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    int position = 0;
    while (stream.incrementToken()) {
        final int posIncrement = posIncrAtt.getPositionIncrement();
        if (posIncrement > 0) {
            // a positive increment starts a new token position: begin a new output line
            position += posIncrement;
            System.out.println();
            System.out.print(position + ": ");
        }
        final String details = "[" + termAtt.toString() + ":" + offsetAtt.startOffset() + "->"
                + offsetAtt.endOffset() + ":" + typeAtt.type() + "] ";
        System.out.print(details);
    }
    System.out.println();
}

From source file:org.LexGrid.LexBIG.Impl.Extensions.GenericExtensions.search.SearchExtensionImpl.java

License:Open Source License

/**
 * Tokenizes the given keywords with the supplied analyzer.
 *
 * @param analyzer the analyzer to apply
 * @param field the field name passed to the analyzer (may affect its behavior)
 * @param keywords the text to tokenize
 * @return the analyzed tokens, in order
 * @throws IOException if the token stream cannot be read
 */
public List<String> tokenize(Analyzer analyzer, String field, String keywords) throws IOException {
    List<String> result = new ArrayList<String>();
    StringReader reader = new StringReader(keywords);
    // try-with-resources closes the stream exactly once
    // (the original closed it both in the try body and in the finally block)
    try (TokenStream stream = analyzer.tokenStream(field, reader)) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    }
    return result;
}

From source file:org.meresco.lucene.analysis.MerescoStandardAnalyzer.java

License:Open Source License

/**
 * Exhausts the given token stream and collects each term as a string.
 *
 * @param tok the token stream to read; it is reset, ended and closed by this method
 * @return the terms in the order produced by the stream
 * @throws IOException if the stream cannot be advanced
 */
public static List<String> readTokenStream(TokenStream tok) throws IOException {
    final CharTermAttribute termAttribute = tok.addAttribute(CharTermAttribute.class);
    final List<String> collected = new ArrayList<String>();
    try {
        tok.reset();
        while (tok.incrementToken()) {
            collected.add(termAttribute.toString());
        }
        tok.end();
    } finally {
        // close unconditionally so the underlying analyzer can be reused
        tok.close();
    }
    return collected;
}

From source file:org.modeshape.jcr.query.lucene.LuceneQueryFactory.java

License:Open Source License

/**
 * Recursively converts a full-text-search term tree into a Lucene {@link Query}.
 * Conjunctions and disjunctions recurse over their nested terms (negated terms
 * become MUST_NOT clauses); simple terms become either a wildcard-aware parsed
 * query or an exact phrase query built from the analyzed tokens.
 *
 * @param selectorName the selector whose field is being searched
 * @param fieldName the name of the field to search; may not be null
 * @param term the full-text-search term tree to convert
 * @return the corresponding Lucene query; never null for the known term types
 * @throws IOException if the analyzer's token stream cannot be read, or if the
 *         wildcard expression cannot be parsed (wrapped {@link ParseException})
 */
public Query createQuery(final SelectorName selectorName, String fieldName, FullTextSearch.Term term)
        throws IOException {
    assert fieldName != null;
    if (term instanceof FullTextSearch.Conjunction) {
        FullTextSearch.Conjunction conjunction = (FullTextSearch.Conjunction) term;
        BooleanQuery query = new BooleanQuery();
        for (FullTextSearch.Term nested : conjunction) {
            if (nested instanceof NegationTerm) {
                Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm());
                // empty sub-queries (e.g. terms fully removed by the analyzer) are dropped
                if (!EMPTY_PHRASE_QUERY.equals(subQuery)) {
                    query.add(subQuery, Occur.MUST_NOT);
                }
            } else {
                Query subQuery = createQuery(selectorName, fieldName, nested);
                if (!EMPTY_PHRASE_QUERY.equals(subQuery)) {
                    query.add(subQuery, Occur.MUST);
                }
            }
        }
        return query;
    }
    if (term instanceof FullTextSearch.Disjunction) {
        FullTextSearch.Disjunction disjunction = (FullTextSearch.Disjunction) term;
        BooleanQuery query = new BooleanQuery();
        for (FullTextSearch.Term nested : disjunction) {
            if (nested instanceof NegationTerm) {
                Query subQuery = createQuery(selectorName, fieldName, ((NegationTerm) nested).getNegatedTerm());
                if (!EMPTY_PHRASE_QUERY.equals(subQuery)) {
                    query.add(subQuery, Occur.MUST_NOT);
                }
            } else {
                Query subQuery = createQuery(selectorName, fieldName, nested);
                if (!EMPTY_PHRASE_QUERY.equals(subQuery)) {
                    query.add(subQuery, Occur.SHOULD);
                }
            }
        }
        return query;
    }
    if (term instanceof FullTextSearch.SimpleTerm) {
        FullTextSearch.SimpleTerm simple = (FullTextSearch.SimpleTerm) term;
        Analyzer analyzer = getFullTextSearchAnalyzer();
        if (simple.containsWildcards()) {
            // Use the ComplexPhraseQueryParser, but instead of wildcard queries (which don't work with leading
            // wildcards) we should use our like queries (which often use RegexQuery where applicable) ...
            QueryParser parser = new QueryParser(version, fieldName, analyzer) {
                @Override
                protected org.apache.lucene.search.Query getWildcardQuery(String field, String termStr) {
                    return findNodesLike(selectorName, termStr.toLowerCase(), field, CaseOperations.LOWERCASE);
                }
            };
            parser.setAllowLeadingWildcard(true);
            try {
                String expression = simple.getValue();
                // The ComplexPhraseQueryParser only understands the '?' and '*' as being wildcards ...
                // (JCR-style '_' and '%' are translated unless escaped with a backslash)
                expression = expression.replaceAll("(?<![\\\\])_", "?");
                expression = expression.replaceAll("(?<![\\\\])%", "*");
                // // Replace any '-' between tokens, except when preceded or followed by a digit, '*', or '?' ...
                expression = expression.replaceAll("((?<![\\d*?]))[-]((?![\\d*?]))", "$1 $2");
                // Then use the parser ...
                return parser.parse(expression);
            } catch (ParseException e) {
                // wrap so the method's signature only exposes IOException
                throw new IOException(e);
            }
        }
        PhraseQuery query = new PhraseQuery();
        query.setSlop(0); // terms must be adjacent
        String expression = simple.getValue();
        // Run the expression through the Lucene analyzer to extract the terms ...
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(expression));
        CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            // The term attribute object has been modified to contain the next term ...
            String analyzedTerm = termAttribute.toString();
            query.add(new Term(fieldName, analyzedTerm));
        }
        return query;
    }
    // Should not get here ...
    assert false;
    return null;
}