Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value.
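
Before the per-project examples, here is a minimal, self-contained sketch of the standard call pattern. It is only an illustrative sketch, assuming Lucene 5.x or later (no-argument StandardAnalyzer constructor, tokenStream(String, String) overload); the field name "content" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources releases it for us.
        try (TokenStream stream = analyzer.tokenStream("content", "Hello Lucene token streams")) {
            // addAttribute must be called before consuming the stream; it returns the
            // stream's single CharTermAttribute instance, creating it if not yet registered.
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(termAttr.toString()); // the attribute is updated in place per token
            }
            stream.end(); // records the end-of-stream state
        }
        analyzer.close();
    }
}

The same attribute instance is reused for every token, which is why all the examples below read it inside the incrementToken() loop. Note that several older examples use TermAttribute and term(), which Lucene 4.0 replaced with CharTermAttribute and toString().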

Usage

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License

private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    int slop = getSlop(node);
    if (slop < 0)
        slop = 0;
    boolean inOrder = true;
    if (node.hasAttribute("ordered"))
        inOrder = node.getAttribute("ordered").equals("yes");

    if (!hasElementContent(node)) {
        String qstr = getText(node);
        List<SpanTermQuery> list = new ArrayList<>(8);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                list.add(new SpanTermQuery(new Term(field, termAttr.toString())));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder);
    }
    SpanQuery[] children = parseSpanChildren(field, node, analyzer);
    return new SpanNearQuery(children, slop, inOrder);
}

From source file:org.exist.indexing.lucene.XMLToQuery.java

License:Open Source License

private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}

From source file:org.exist.indexing.range.RangeIndexWorker.java

License:Open Source License

protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs)
        throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}

From source file:org.fao.geonet.kernel.search.LuceneSearcher.java

License:Open Source License

/**
 * Splits text into tokens using the Analyzer that is matched to the field.
 *
 * @param field the name of the field, used to select the analyzer
 * @param requestStr the text to tokenize
 * @param a the per-field analyzer wrapper
 * @return the space-separated tokens, re-quoted if the request was a phrase
 */
private static String analyzeText(String field, String requestStr, PerFieldAnalyzerWrapper a) {

    boolean phrase = false;
    if (requestStr.startsWith("\"") && requestStr.endsWith("\"")) {
        phrase = true;
    }

    TokenStream ts = a.tokenStream(field, new StringReader(requestStr));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

    List<String> tokenList = new ArrayList<String>();
    try {
        ts.reset();
        while (ts.incrementToken()) {
            tokenList.add(termAtt.term());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        // TODO why swallow
        e.printStackTrace();
    }

    StringBuilder result = new StringBuilder();

    for (int i = 0; i < tokenList.size(); i++) {
        if (i > 0) {
            result.append(" ");
        }
        result.append(tokenList.get(i));
    }
    String outStr = result.toString();
    if (phrase) {
        outStr = "\"" + outStr + "\"";
    }
    return outStr;
}

From source file:org.hbasene.index.HBaseIndexWriter.java

License:Apache License

public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?

    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();

    for (Fieldable field : doc.getFields()) {

        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            TermAttribute termAttr = tokens.addAttribute(TermAttribute.class);
            PositionIncrementAttribute posIncrAttr = tokens.addAttribute(PositionIncrementAttribute.class);
            tokens.reset();

            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), termAttr.term());

                List<Integer> pvec = termPositions.get(term);

                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }

                position += (posIncrAttr.getPositionIncrement() - 1);
                pvec.add(++position);

            }
            tokens.end();
            tokens.close();

        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);

        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));

            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}

From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java

License:LGPL

/**
 * Returns the first token resulting from the analysis, logging a warning if there is more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}

From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java

License:Open Source License

public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if for a given position more than one term is present
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);

        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);

        stream.reset();
        int position = -1; //start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }

            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }

            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }

            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions;
     * note that a MultiPhraseQuery is needed if several terms share the same position,
     * as it will do an OR and not an AND like PhraseQuery
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}

From source file:org.hibernate.search.query.dsl.impl.Helper.java

License:Open Source License

static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();

    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}

From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java

License:LGPL

/**
 * Adds the term frequencies found by tokenizing text from the reader into the given map.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a map of terms and their frequencies
 * @param fieldContext provides the field name, used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }

            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}

From source file:org.hibernate.search.test.util.AnalyzerUtils.java

License:Open Source License

public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    List<Token> tokenList = new ArrayList<Token>();
    stream.reset();
    while (stream.incrementToken()) {
        tokenList.add(new Token(term.term(), 0, 0));
    }
    stream.end();
    stream.close();

    return tokenList.toArray(new Token[tokenList.size()]);
}