List of usage examples for org.apache.lucene.analysis.TokenStream.close()
@Override public void close() throws IOException
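All of the examples below follow the same lifecycle: reset(), an incrementToken() loop, end(), and finally close(). As a quick orientation before the collected sources, here is a minimal self-contained sketch of that pattern; it is not taken from any of the projects below, the field name and input text are placeholders, and the no-argument StandardAnalyzer constructor assumes Lucene 5.x or later. Since TokenStream implements Closeable, try-with-resources handles close() automatically.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenStreamCloseDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources works because TokenStream implements Closeable
        try (TokenStream ts = analyzer.tokenStream("body", "Some text to analyze")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                  // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString());
            }
            ts.end();                    // records end-of-stream state before close()
        }                                // ts.close() called here by try-with-resources
        analyzer.close();
    }
}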
From source file:org.grouplens.samantha.modeler.featurizer.FeatureExtractorUtilities.java
License:Open Source License
static public Map<String, Integer> getTermFreq(Analyzer analyzer, String text, String termField) {
    TokenStream ts = analyzer.tokenStream(termField, text);
    Map<String, Integer> termFreq = new HashMap<>();
    try {
        // read the token text via CharTermAttribute (reflectAsString() would return a
        // debug dump of all attributes, not the bare term)
        CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String term = termAttr.toString();
            int cnt = termFreq.getOrDefault(term, 0);
            termFreq.put(term, cnt + 1);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
        logger.error("{}", e.getMessage());
        throw new BadRequestException(e);
    }
    return termFreq;
}
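A hypothetical call site for this helper; the StandardAnalyzer choice and the field name are assumptions for illustration, not from the original project:

// Hypothetical usage of the helper above; StandardAnalyzer is an assumed choice.
Analyzer analyzer = new StandardAnalyzer();
Map<String, Integer> freq = FeatureExtractorUtilities.getTermFreq(analyzer, "Red fish blue fish", "body");
// freq would map "red" -> 1, "fish" -> 2, "blue" -> 1 (StandardAnalyzer lowercases tokens)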
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();
    for (Fieldable field : doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);
            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());
                List<Integer> pvec = termPositions.get(term);
                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }
                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));
            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if more than one token is found.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
From source file:org.hibernate.search.backend.spi.SingularTermDeletionQuery.java
License:LGPL
@Override
public Query toLuceneQuery(DocumentBuilderIndexedEntity documentBuilder) {
    AnalyzerReference analyzerReferenceForEntity = documentBuilder.getAnalyzerReference();
    String stringValue = documentBuilder.objectToString(fieldName, this.getValue(),
            new ContextualExceptionBridgeHelper());
    if (this.getType() == Type.STRING) {
        try {
            if (analyzerReferenceForEntity.is(RemoteAnalyzerReference.class)) {
                // no need to take into account the analyzer here as it will be dealt with remotely
                return new TermQuery(new Term(this.getFieldName(), stringValue));
            }
            ScopedLuceneAnalyzer analyzerForEntity = (ScopedLuceneAnalyzer) analyzerReferenceForEntity
                    .unwrap(LuceneAnalyzerReference.class).getAnalyzer();
            TokenStream tokenStream = analyzerForEntity.tokenStream(this.getFieldName(), stringValue);
            tokenStream.reset();
            try {
                BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
                while (tokenStream.incrementToken()) {
                    String term = tokenStream.getAttribute(CharTermAttribute.class).toString();
                    booleanQueryBuilder.add(new TermQuery(new Term(this.getFieldName(), term)), Occur.FILTER);
                }
                return booleanQueryBuilder.build();
            } finally {
                tokenStream.close();
            }
        } catch (IOException e) {
            throw new AssertionFailure(
                    "No IOException can occur while using a TokenStream that is generated via String");
        }
    } else {
        FieldBridge fieldBridge = documentBuilder.getBridge(fieldName);
        if (NumericFieldUtils.isNumericFieldBridge(fieldBridge)) {
            return NumericFieldUtils.createExactMatchQuery(fieldName, this.getValue());
        } else {
            return new TermQuery(new Term(this.getFieldName(), stringValue));
        }
    }
}
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect whether, for a given position, more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; // start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions.
     * Note that a MultiPhraseQuery is needed if several terms share the same position,
     * as it will do an OR and not an AND like PhraseQuery.
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();
    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to "
                + "pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}
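This helper targets the pre-4.0 Lucene API: reusableTokenStream and TermAttribute were both removed in Lucene 4. A minimal sketch of the same helper against the modern API might look like the following; this is a hypothetical rewrite for comparison, not code from the Hibernate Search sources:

// Hypothetical modern-API (Lucene 4.x+) equivalent of the helper above:
// tokenStream(..) replaces reusableTokenStream(..), CharTermAttribute replaces TermAttribute,
// and try-with-resources replaces the explicit close() call.
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<>();
    try (TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(localText))) {
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (attribute.length() > 0) {
                terms.add(attribute.toString());
            }
        }
        stream.end(); // records end-of-stream state before the implicit close()
    }
    return terms;
}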
From source file:org.hibernate.search.util.impl.InternalAnalyzerUtils.java
License:LGPL
/**
 * Returns the first token resulting from the analysis, logging a warning if more than one token is found.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String analyzeSortableValue(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsInAnalyzedSortableField(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotAnalyzeSortableField(fieldName, e);
    }
}
From source file:org.ihtsdo.otf.query.lucene.LuceneIndexer.java
License:Apache License
protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException {
    StringReader textReader = new StringReader(searchString);
    TokenStream tokenStream = analyzer.tokenStream(field, textReader);
    tokenStream.reset();
    List<String> terms = new ArrayList<>();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        terms.add(charTermAttribute.toString());
    }
    textReader.close();
    tokenStream.close();
    // note: this closes the analyzer itself, so callers must pass a fresh Analyzer on each call
    analyzer.close();
    BooleanQuery bq = new BooleanQuery();
    if (terms.size() > 0 && !searchString.endsWith(" ")) {
        // treat the last (possibly partial) term as a prefix query
        String last = terms.remove(terms.size() - 1);
        bq.add(new PrefixQuery((new Term(field, last))), Occur.MUST);
    }
    terms.stream().forEach((s) -> {
        bq.add(new TermQuery(new Term(field, s)), Occur.MUST);
    });
    return bq;
}
From source file:org.index.Tag.java
private String getBagOfWords(String text) throws Exception {
    StringBuffer buff = new StringBuffer();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new SOAnalyzer(toStem, stopFile);
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:org.index.TermScore.java
private List<String> getBagOfWords(String text) throws Exception {
    List<String> terms = new ArrayList<>();
    text = Question.removeTags(text);
    boolean toStem = Boolean.parseBoolean(prop.getProperty("stem", "true"));
    String stopFile = prop.getProperty("stopfile");
    Analyzer analyzer = new WhitespaceAnalyzer(Version.LUCENE_4_9); /* SOAnalyzer(toStem, stopFile) */
    TokenStream stream = analyzer.tokenStream("bow", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        terms.add(term);
    }
    stream.end();
    stream.close();
    return terms;
}