Example usage for org.apache.lucene.analysis TokenStream end

List of usage examples for org.apache.lucene.analysis TokenStream end

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream end.

Prototype

public void end() throws IOException 

Source Link

Document

This method is called by the consumer after the last token has been consumed, after #incrementToken() returned false (using the new TokenStream API).

Usage

From source file:org.genemania.completion.lucene.GeneCompletionProvider.java

License:Open Source License

/**
 * Resolves a gene symbol to its node id by running the analyzed symbol as a
 * phrase query against the gene field.
 *
 * @param symbol the raw gene symbol to look up
 * @return the first matching node id, or {@code null} if nothing matched or an
 *         I/O error occurred (the error is logged, not rethrown)
 */
public Long getNodeId(String symbol) {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        // Consume the stream inside try/finally so the underlying resources
        // are released even if reset()/incrementToken() throws.
        try {
            tokens.reset();
            while (tokens.incrementToken()) {
                TermAttribute term = tokens.getAttribute(TermAttribute.class);
                query.add(new Term(GeneIndexBuilder.GENE_FIELD, term.term()));
            }
            // end() must be called after the last incrementToken() per the
            // TokenStream contract, before close().
            tokens.end();
        } finally {
            tokens.close();
        }

        final Set<Long> nodes = new HashSet<Long>();
        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int id) {
                try {
                    Document document = searcher.doc(id);
                    nodes.add(Long.parseLong(document.get(GeneIndexBuilder.NODE_ID_FIELD)));
                } catch (IOException e) {
                    // Best-effort per hit: log and keep collecting.
                    log(e);
                }
            }
        });
        if (nodes.size() > 0) {
            // Multiple hits are possible; return an arbitrary first one,
            // matching the original behavior.
            return nodes.iterator().next();
        }
    } catch (IOException e) {
        log(e);
    }
    return null;
}

From source file:org.genemania.data.classification.lucene.LuceneGeneClassifier.java

License:Open Source License

/**
 * Classifies a gene symbol by searching the index for matching gene documents
 * and reporting each hit's organism id to the handler.
 *
 * @param symbol  the gene symbol to classify
 * @param handler callback invoked once per matching document
 * @throws ApplicationException if the search or analysis fails with an I/O error
 */
public void classify(final String symbol, final IGeneClassificationHandler handler)
        throws ApplicationException {
    try {
        TokenStream tokens = analyze(symbol);
        PhraseQuery query = new PhraseQuery();
        // try/finally guarantees the stream is closed even when token
        // consumption throws, preventing a resource leak.
        try {
            tokens.reset();
            while (tokens.incrementToken()) {
                TermAttribute term = tokens.getAttribute(TermAttribute.class);
                query.add(new Term(LuceneMediator.GENE_SYMBOL, term.term()));
            }
            // TokenStream contract: end() after the last token, before close().
            tokens.end();
        } finally {
            tokens.close();
        }

        searcher.search(query, new AbstractCollector() {
            @Override
            public void handleHit(int doc) {
                try {
                    Document document = searcher.doc(doc);
                    long organismId = Long.parseLong(document.get(LuceneMediator.GENE_ORGANISM_ID));
                    handler.handleClassification(symbol, organismId);
                } catch (IOException e) {
                    // One unreadable document should not abort the whole search.
                    log(e);
                }
            }
        });
    } catch (IOException e) {
        throw new ApplicationException(e);
    }
}

From source file:org.genemania.mediator.lucene.LuceneMediator.java

License:Open Source License

/**
 * Analyzes {@code phrase} and builds a {@link PhraseQuery} containing one term
 * per emitted token, all targeting {@code field}.
 *
 * @param field  the index field the phrase terms apply to
 * @param phrase the raw text to analyze
 * @return the assembled phrase query (empty if the analyzer emitted no tokens)
 * @throws IOException if the token stream fails
 */
protected PhraseQuery createPhraseQuery(String field, String phrase) throws IOException {
    TokenStream stream = analyze(phrase);
    // Close the stream in finally so an exception during consumption
    // cannot leak the analyzer's resources.
    try {
        stream.reset();
        PhraseQuery query = new PhraseQuery();
        while (stream.incrementToken()) {
            TermAttribute term = stream.getAttribute(TermAttribute.class);
            query.add(new Term(field, term.term()));
        }
        // Contract: end() after the final incrementToken(), before close().
        stream.end();
        return query;
    } finally {
        stream.close();
    }
}

From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if there are more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze/*w w w.  j a v a  2s.  co  m*/
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
/**
 * Runs {@code text} through the analyzer for {@code fieldName} and returns the
 * first emitted token, logging a warning when the analysis produces more than
 * one token.
 *
 * @param analyzer  the Lucene analyzer to apply
 * @param fieldName the field name, which may influence analyzer behavior
 * @param text      the value to analyze
 * @return the first token, or {@code null} when the analyzer emits none
 * @throws SearchException (via {@code log.couldNotNormalizeField}) on failure
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
            String result = null;
            stream.reset();
            if (stream.incrementToken()) {
                result = new String(termAttribute.buffer(), 0, termAttribute.length());
                if (stream.incrementToken()) {
                    // More than one token: the "normalized" value is ambiguous.
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    // Stream fully consumed; honor the end() contract.
                    stream.end();
                }
            }
            return result;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}

From source file:org.hibernate.search.indexes.serialization.impl.CopyTokenStream.java

License:Open Source License

/**
 * Drains {@code input}, snapshotting every attribute of every token so the
 * stream can later be replayed from the cache. Closing the stream is left to
 * the caller; this method only calls {@code end()} after the last token.
 *
 * @param input the token stream to consume
 * @return one list of cloned attributes per token, in stream order
 * @throws IOException if token consumption fails
 */
private static List<List<AttributeImpl>> fillCache(TokenStream input) throws IOException {
    List<List<AttributeImpl>> cached = new ArrayList<List<AttributeImpl>>();
    while (input.incrementToken()) {
        List<AttributeImpl> snapshot = new ArrayList<AttributeImpl>();
        Iterator<AttributeImpl> attributes = input.getAttributeImplsIterator();
        while (attributes.hasNext()) {
            // Clone each attribute: implementations are reused across
            // incrementToken() calls, so storing the live instance would
            // alias every cached token to the last one.
            snapshot.add((AttributeImpl) attributes.next().clone());
        }
        cached.add(snapshot);
    }
    input.end();
    return cached;
}

From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java

License:Open Source License

/**
 * Builds the per-field query for a phrase DSL clause. The sentence is analyzed
 * and terms are bucketed by token position; the resulting query is a
 * TermQuery (single position, single term), a BooleanQuery of SHOULD clauses
 * (single position, multiple terms), a PhraseQuery (multiple positions, one
 * term each), or a MultiPhraseQuery (multiple positions, at least one of which
 * holds several terms, e.g. produced by synonym filters).
 */
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if for a given position more than
     * one term is present (happens with zero position-increment tokens such
     * as synonyms or stemming variants).
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);

        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);

        stream.reset();
        int position = -1; //start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }

            // A positive increment starts a new position; an increment of 0
            // keeps stacking terms onto the current position's bucket.
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }

            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }

            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions
     * note that a MultiPhraseQuery is needed if several terms share the same position
     * as it will do a OR and not a AND like PhraseQuery
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            // Degenerate phrase of one term: plain TermQuery is cheapest.
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            // One position, several stacked terms: OR them together.
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}

From source file:org.hibernate.search.query.dsl.impl.Helper.java

License:Open Source License

/**
 * Analyzes {@code localText} with the given analyzer and collects every
 * non-empty term it produces.
 *
 * @param fieldName the field whose analysis rules should apply
 * @param localText the text to tokenize; must not be {@code null}
 * @param analyzer  the analyzer to run
 * @return the list of extracted terms, possibly empty
 * @throws SearchException if {@code localText} is {@code null} (likely a
 *         FieldBridge conversion problem)
 * @throws IOException if tokenization fails
 */
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();

    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type?"
                + "Alternatively, apply the ignoreFieldBridge() option to " + "pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    // Close in finally so a failure mid-tokenization cannot leak the stream.
    try {
        TermAttribute attribute = stream.addAttribute(TermAttribute.class);
        stream.reset();

        while (stream.incrementToken()) {
            if (attribute.termLength() > 0) {
                String term = attribute.term();
                terms.add(term);
            }
        }
        // Contract: end() after the last incrementToken(), before close().
        stream.end();
    } finally {
        stream.close();
    }
    return terms;
}

From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java

License:LGPL

/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 *///  w w  w.  j  a v a  2 s. c  o  m
/**
 * Adds term frequencies found by tokenizing text from the reader into the
 * {@code termFreqMap}.
 *
 * @param r            a source of text to be tokenized
 * @param termFreqMap  a Map of terms and their frequencies, updated in place
 * @param fieldContext carries the field name (drives per-field analysis) and
 *                     whether the analyzer should be applied at all
 * @throws IOException if tokenization fails
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream tokenStream = analyzer.tokenStream(fieldName, r);
    try {
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        int parsedTokens = 0;
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = termAttribute.toString();
            parsedTokens++;
            // Cap the amount of text we are willing to analyze.
            if (parsedTokens > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(term)) {
                continue;
            }

            // First sighting creates a counter; later sightings bump it.
            Int frequency = termFreqMap.get(term);
            if (frequency == null) {
                termFreqMap.put(term, new Int());
            } else {
                frequency.x++;
            }
        }
        tokenStream.end();
    } finally {
        // Suppresses close() failures rather than masking an in-flight exception.
        IOUtils.closeWhileHandlingException(tokenStream);
    }
}

From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if there are more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze/*from ww w  .  j av  a2  s .  c  o m*/
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
/**
 * Analyzes {@code text} for the given field and returns the first resulting
 * token, logging a warning when the analysis yields more than one token
 * (a sortable field should normalize to a single term).
 *
 * @param analyzer  the Lucene analyzer to use
 * @param fieldName the field name; may affect analyzer behavior
 * @param text      the value to analyze
 * @return the first token, or {@code null} when no token is produced
 * @throws SearchException (via {@code log.couldNotAnalyzeSortableField}) on failure
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
            String result = null;
            stream.reset();
            if (stream.incrementToken()) {
                result = new String(termAttribute.buffer(), 0, termAttribute.length());
                if (stream.incrementToken()) {
                    // Multiple terms make the sort key ambiguous; warn.
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    // Fully consumed; honor the end() contract.
                    stream.end();
                }
            }
            return result;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}

From source file:org.index.Tag.java

private String getBagOfWords(String text) throws Exception {

    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);//from   w w w . j a va  2s.  c  o  m

    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }

    stream.end();
    stream.close();

    return buff.toString();
}