Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis TokenStream#reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using #incrementToken().
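
The full consumption contract pairs reset() with incrementToken(), end() and close(). Before the per-project examples below, here is a minimal self-contained sketch of that workflow; it assumes a Lucene version (5.x or later) where StandardAnalyzer has a no-argument constructor, and the field name "body" and the sample text are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream(...) returns an unconsumed stream; reset() must be called
        // once before the first call to incrementToken().
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("red black tree"))) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end(); // record end-of-stream state such as the final offset
        } // try-with-resources closes the stream
        analyzer.close();
    }
}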

Usage

From source file:org.gridkit.coherence.search.lucene.TokenStreamCheck.java

License:Apache License

@Test
public void analyze() throws IOException {

    WhitespaceAnalyzer wa = new WhitespaceAnalyzer(Version.LUCENE_42);
    wa.getOffsetGap("xxx");
    TokenStream ts = wa.tokenStream("test", new StringReader("red black tree"));
    ts.reset();
    ts.incrementToken();
    ts.getAttribute(CharTermAttribute.class).buffer();

    CapturedTokenStream cts = new CapturedTokenStream(ts);
    cts.reset();
    cts.incrementToken();
    cts.getAttribute(CharTermAttribute.class).buffer();
}

From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java

License:Open Source License

static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            String term = ts.reflectAsString(false);
            int cnt = termFreq.getOrDefault(term, 0);
            termFreq.put(term, cnt + 1);
        }
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
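
A note on the loop above: reflectAsString(false) serializes every attribute of the stream (term text, offsets, position increment, and so on) into a single string, so the map keys are full attribute dumps rather than bare terms. If only the term text is wanted, CharTermAttribute is the usual route; a minimal sketch of that variant (assuming Java 8 for Map.merge):

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // count the raw term text only
        termFreq.merge(termAtt.toString(), 1, Integer::sum);
    }
    ts.end();
    ts.close();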

From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if the analysis produces more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}

From source file:org.hibernate.search.backend.spi.SingularTermDeletionQuery.java

License:LGPL

@Override
public Query toLuceneQuery(DocumentBuilderIndexedEntity documentBuilder) {
    AnalyzerReference analyzerReferenceForEntity = documentBuilder.getAnalyzerReference();
    String stringValue = documentBuilder.objectToString(fieldName, this.getValue(),
            new ContextualExceptionBridgeHelper());

    if (this.getType() == Type.STRING) {
        try {
            if (analyzerReferenceForEntity.is(RemoteAnalyzerReference.class)) {
                // no need to take into account the analyzer here as it will be dealt with remotely
                return new TermQuery(new Term(this.getFieldName(), stringValue));
            }

            ScopedLuceneAnalyzer analyzerForEntity = (ScopedLuceneAnalyzer) analyzerReferenceForEntity
                    .unwrap(LuceneAnalyzerReference.class).getAnalyzer();
            TokenStream tokenStream = analyzerForEntity.tokenStream(this.getFieldName(), stringValue);
            tokenStream.reset();
            try {
                BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
                while (tokenStream.incrementToken()) {
                    String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
                    booleanQueryBuilder.add(new TermQuery(new Term(this.getFieldName(), term)), Occur.FILTER);
                }
                return booleanQueryBuilder.build();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            throw new AssertionFailure(
                    "No IOException can occur while using a TokenStream that is generated via String");
        }
    } else {
        FieldBridge fieldBridge = documentBuilder.getBridge(fieldName);
        if (NumericFieldUtils.isNumericFieldBridge(fieldBridge)) {
            return NumericFieldUtils.createExactMatchQuery(fieldName, this.getValue());
        } else {
            return new TermQuery(new Term(this.getFieldName(), stringValue));
        }
    }
}
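
Note: Occur.FILTER clauses must match but, unlike MUST, do not contribute to scoring, which fits the intent here: the query selects documents for deletion rather than ranking them.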

From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java

License:Open Source License

public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if for a given position more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);

        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);

        stream.reset();
        int position = -1; //start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }

            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }

            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }

            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions
     * note that a MultiPhraseQuery is needed if several terms share the same position
     * as it will do an OR and not an AND like PhraseQuery
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
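
Note: this example and the next use the older Lucene 3.x analysis API (Analyzer.reusableTokenStream and TermAttribute); in Lucene 4 and later these were replaced by Analyzer.tokenStream and CharTermAttribute, as in the other examples on this page.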

From source file:org.hibernate.search.query.dsl.impl.Helper.java

License:Open Source License

static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();

    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}

From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java

License:LGPL

/**
 * Adds term frequencies found by tokenizing text from the reader into the given term-frequency map.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldContext carries the field name, used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.hibernate.search.test.serialization.SerializationTest.java

License:Open Source License

private boolean compareTokenStreams(TokenStream original, TokenStream copy) {
    if (original == null) {
        return copy == null;
    }
    try {
        original.reset();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    SerializableTokenStream serOriginal = CopyTokenStream.buildSerializabletokenStream(original);
    SerializableTokenStream serCopy = CopyTokenStream.buildSerializabletokenStream(copy);
    if (serOriginal.getStream().size() != serCopy.getStream().size()) {
        return false;
    }
    for (int i = 0; i < serOriginal.getStream().size(); i++) {
        List<AttributeImpl> origToken = serOriginal.getStream().get(i);
        List<AttributeImpl> copyToken = serCopy.getStream().get(i);
        if (origToken.size() != copyToken.size()) {
            return false;
        }
        for (int j = 0; j < origToken.size(); j++) {
            AttributeImpl origAttr = origToken.get(j);
            AttributeImpl copyAttr = copyToken.get(j);
            if (origAttr.getClass() != copyAttr.getClass()) {
                return false;
            }
            testAttributeTypes(origAttr, copyAttr);
        }
    }
    return true;
}

From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if the analysis produces more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}

From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java

License:Apache License

protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    StringReader textReader = new StringReader(searchString);
    TokenStream tokenStream = analyzer.tokenStream(field, textReader);
    tokenStream.reset();
    List<String> terms = new ArrayList<>();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    while (tokenStream.incrementToken()) {
        terms.add(charTermAttribute.toString());
    }
    textReader.close();
    tokenStream.close();
    analyzer.close();

    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery((new Term(field, last))), Occur.MUST);
    }
    terms.stream().forEach((s) -> {
        bq.add(new TermQuery(new Term(field, s)), Occur.MUST);
    });

    return bq;
}
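
For example, given the search string "red bla" this builds a query equivalent to +field:red +field:bla*, turning the last token into a PrefixQuery; with a trailing space ("red bla ") every token becomes an exact TermQuery instead.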