List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute, collected from open-source projects.
public final <T extends Attribute> T addAttribute(Class<T> attClass)
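addAttribute returns the attribute instance of the given class registered with this stream, creating and adding a new instance first if one is not already present; the returned instance is then updated in place on each call to incrementToken(). All of the examples below follow the same consumption pattern. Here is a minimal sketch of that pattern (the StandardAnalyzer, field name, and printing are illustrative placeholders, not taken from any example below; the no-argument StandardAnalyzer constructor assumes Lucene 5.x or later):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static void printTokens(String text) throws IOException {
    Analyzer analyzer = new StandardAnalyzer();
    // TokenStream is Closeable, so try-with-resources takes care of close().
    try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
        // Register (or fetch) the attribute before consuming the stream.
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAttr.toString()); // termAttr is updated in place per token
        }
        stream.end(); // records the final token state before close()
    }
}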
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
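This example tokenizes the text content of a query element and turns each token into a SpanTermQuery for a SpanNearQuery; addAttribute(CharTermAttribute.class) gives access to each token's text.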
private SpanQuery nearQuery(String field, Element node, Analyzer analyzer) throws XPathException {
    int slop = getSlop(node);
    if (slop < 0)
        slop = 0;
    boolean inOrder = true;
    if (node.hasAttribute("ordered"))
        inOrder = node.getAttribute("ordered").equals("yes");
    if (!hasElementContent(node)) {
        String qstr = getText(node);
        List<SpanTermQuery> list = new ArrayList<>(8);
        try {
            TokenStream stream = analyzer.tokenStream(field, new StringReader(qstr));
            CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                list.add(new SpanTermQuery(new Term(field, termAttr.toString())));
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            throw new XPathException("Error while parsing phrase query: " + qstr);
        }
        return new SpanNearQuery(list.toArray(new SpanTermQuery[list.size()]), slop, inOrder);
    }
    SpanQuery[] children = parseSpanChildren(field, node, analyzer);
    return new SpanNearQuery(children, slop, inOrder);
}
From source file:org.exist.indexing.lucene.XMLToQuery.java
License:Open Source License
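This example analyzes a string and returns only the first token it produces, again reading token text through a CharTermAttribute.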
private String getTerm(String field, String text, Analyzer analyzer) throws XPathException {
    String term = null;
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        if (stream.incrementToken()) {
            term = termAttr.toString();
        }
        stream.end();
        stream.close();
        return term;
    } catch (IOException e) {
        throw new XPathException("Lucene index error while creating query: " + e.getMessage(), e);
    }
}
From source file:org.exist.indexing.range.RangeIndexWorker.java
License:Open Source License
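This example analyzes range-index content and captures the first token as a BytesRef via TermToBytesRefAttribute, falling back to the raw string when no analyzer is configured.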
protected BytesRef analyzeContent(String field, QName qname, String data, DocumentSet docs) throws XPathException {
    final Analyzer analyzer = getAnalyzer(qname, field, docs);
    if (!isCaseSensitive(qname, field, docs)) {
        data = data.toLowerCase();
    }
    if (analyzer == null) {
        return new BytesRef(data);
    }
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(data));
        TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
        BytesRef token = null;
        try {
            stream.reset();
            if (stream.incrementToken()) {
                termAttr.fillBytesRef();
                token = termAttr.getBytesRef();
            }
            stream.end();
        } finally {
            stream.close();
        }
        return token;
    } catch (IOException e) {
        throw new XPathException("Error analyzing the query string: " + e.getMessage(), e);
    }
}
From source file:org.fao.geonet.kernel.search.LuceneSearcher.java
License:Open Source License
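This example splits a request string into tokens and rejoins them with spaces, re-adding the surrounding quotes for phrase queries; it uses the pre-Lucene 4 TermAttribute API.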
/**
 * Splits text into tokens using the Analyzer that is matched to the field.
 *
 * @param field the field whose analyzer should be used
 * @param requestStr the request string to tokenize
 * @param a the per-field analyzer wrapper
 * @return the analyzed string, re-quoted if the input was a phrase
 */
private static String analyzeText(String field, String requestStr, PerFieldAnalyzerWrapper a) {
    boolean phrase = false;
    if (requestStr.startsWith("\"") && requestStr.endsWith("\"")) {
        phrase = true;
    }
    TokenStream ts = a.tokenStream(field, new StringReader(requestStr));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    List<String> tokenList = new ArrayList<String>();
    try {
        while (ts.incrementToken()) {
            tokenList.add(termAtt.term());
        }
    } catch (Exception e) {
        // TODO why swallow
        e.printStackTrace();
    }
    StringBuilder result = new StringBuilder();
    for (int i = 0; i < tokenList.size(); i++) {
        if (i > 0) {
            result.append(" ");
        }
        result.append(tokenList.get(i));
    }
    String outStr = result.toString();
    if (phrase) {
        outStr = "\"" + outStr + "\"";
    }
    return outStr;
}
From source file:org.hbasene.index.HBaseIndexWriter.java
License:Apache License
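This example registers both TermAttribute and PositionIncrementAttribute on a stream to build per-term position vectors while indexing a document into HBase.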
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?
    }
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();
    for (Fieldable field : doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);
            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }
            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());
                List<Integer> pvec = termPositions.get(term);
                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }
                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);
            }
            tokens.close();
        }
        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);
        }
        // Stores each field as a column under this doc key
        if (field.isStored()) {
            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());
            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));
            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}
From source file:org.hibernate.search.backend.lucene.util.impl.AnalyzerUtils.java
License:LGPL
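This example normalizes a value by analysis, returning the first token and logging a warning if the analyzer unexpectedly produces more than one.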
/**
 * Returns the first token resulting from the analysis, logging a warning if the analysis produces more than one token.
 *
 * @param analyzer the Lucene analyzer to use
 * @param fieldName the name of the field: might affect the analyzer behavior
 * @param text the value to analyze
 * @return the first token resulting from the analysis
 *
 * @throws SearchException if a problem occurs when analyzing the sortable field's value.
 */
public static String normalize(Analyzer analyzer, String fieldName, String text) {
    final TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(text));
    try {
        try {
            String firstToken = null;
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            if (stream.incrementToken()) {
                firstToken = new String(term.buffer(), 0, term.length());
                if (stream.incrementToken()) {
                    log.multipleTermsDetectedDuringNormalization(fieldName);
                } else {
                    stream.end();
                }
            }
            return firstToken;
        } finally {
            stream.close();
        }
    } catch (SearchException | IOException e) {
        throw log.couldNotNormalizeField(fieldName, e);
    }
}
From source file:org.hibernate.search.query.dsl.impl.ConnectedMultiFieldsPhraseQueryBuilder.java
License:Open Source License
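This example tracks position increments while tokenizing a phrase, so that terms sharing a position can be combined into a MultiPhraseQuery rather than a plain PhraseQuery.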
public Query createQuery(FieldContext fieldContext) {
    final Query perFieldQuery;
    final String fieldName = fieldContext.getField();

    /*
     * Store terms per position and detect if more than one term is present for a given position.
     */
    TokenStream stream = null;
    boolean isMultiPhrase = false;
    Map<Integer, List<Term>> termsPerPosition = new HashMap<Integer, List<Term>>();
    final String sentence = phraseContext.getSentence();
    try {
        Reader reader = new StringReader(sentence);
        stream = queryContext.getQueryAnalyzer().reusableTokenStream(fieldName, reader);
        TermAttribute termAttribute = stream.addAttribute(TermAttribute.class);
        PositionIncrementAttribute positionAttribute = stream.addAttribute(PositionIncrementAttribute.class);
        stream.reset();
        int position = -1; // start at -1 since we apply at least one increment
        List<Term> termsAtSamePosition = null;
        while (stream.incrementToken()) {
            int positionIncrement = 1;
            if (positionAttribute != null) {
                positionIncrement = positionAttribute.getPositionIncrement();
            }
            if (positionIncrement > 0) {
                position += positionIncrement;
                termsAtSamePosition = termsPerPosition.get(position);
            }
            if (termsAtSamePosition == null) {
                termsAtSamePosition = new ArrayList<Term>();
                termsPerPosition.put(position, termsAtSamePosition);
            }
            termsAtSamePosition.add(new Term(fieldName, termAttribute.term()));
            if (termsAtSamePosition.size() > 1) {
                isMultiPhrase = true;
            }
        }
    } catch (IOException e) {
        throw new AssertionFailure("IOException while reading a string. Doh!", e);
    } finally {
        if (stream != null) {
            try {
                stream.end();
                stream.close();
            } catch (IOException e) {
                throw new AssertionFailure("IOException while reading a string. Doh!", e);
            }
        }
    }

    /*
     * Create the appropriate query depending on the conditions;
     * note that a MultiPhraseQuery is needed if several terms share the same position,
     * as it will do an OR and not an AND like PhraseQuery.
     */
    final int size = termsPerPosition.size();
    if (size == 0) {
        throw new SearchException(
                "phrase query returns no term. Is there a problem with your analyzers? " + sentence);
    }
    if (size == 1) {
        final List<Term> terms = termsPerPosition.values().iterator().next();
        if (terms.size() == 1) {
            perFieldQuery = new TermQuery(terms.get(0));
        } else {
            BooleanQuery query = new BooleanQuery();
            for (Term term : terms) {
                query.add(new TermQuery(term), BooleanClause.Occur.SHOULD);
            }
            perFieldQuery = query;
        }
    } else {
        if (isMultiPhrase) {
            MultiPhraseQuery query = new MultiPhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.toArray(new Term[value.size()]), entry.getKey());
            }
            perFieldQuery = query;
        } else {
            PhraseQuery query = new PhraseQuery();
            query.setSlop(phraseContext.getSlop());
            for (Map.Entry<Integer, List<Term>> entry : termsPerPosition.entrySet()) {
                final List<Term> value = entry.getValue();
                query.add(value.get(0), entry.getKey());
            }
            perFieldQuery = query;
        }
    }
    return fieldContext.getFieldCustomizer().setWrappedQuery(perFieldQuery).createQuery();
}
From source file:org.hibernate.search.query.dsl.impl.Helper.java
License:Open Source License
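This example collects every non-empty term produced by analyzing a text value.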
static List<String> getAllTermsFromText(String fieldName, String localText, Analyzer analyzer)
        throws IOException {
    List<String> terms = new ArrayList<String>();
    // Can't deal with null at this point. Likely returned by some FieldBridge not recognizing the type.
    if (localText == null) {
        throw new SearchException("Search parameter on field " + fieldName + " could not be converted. "
                + "Are the parameter and the field of the same type? "
                + "Alternatively, apply the ignoreFieldBridge() option to pass String parameters");
    }
    Reader reader = new StringReader(localText);
    TokenStream stream = analyzer.reusableTokenStream(fieldName, reader);
    TermAttribute attribute = stream.addAttribute(TermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (attribute.termLength() > 0) {
            String term = attribute.term();
            terms.add(term);
        }
    }
    stream.end();
    stream.close();
    return terms;
}
From source file:org.hibernate.search.query.dsl.impl.MoreLikeThisBuilder.java
License:LGPL
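This example counts term frequencies for a more-like-this query, skipping noise words and capping the number of tokens parsed.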
/**
 * Adds the term frequencies found by tokenizing text from the reader into the given map.
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldContext provides the field name, used by the analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, FieldContext fieldContext)
        throws IOException {
    String fieldName = fieldContext.getField();
    Analyzer analyzer = queryContext.getQueryAnalyzerReference().unwrap(LuceneAnalyzerReference.class)
            .getAnalyzer();
    if (!fieldContext.applyAnalyzer()) {
        // essentially does the Reader to String conversion for us
        analyzer = PassThroughAnalyzer.INSTANCE;
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    try {
        int tokenCount = 0;
        // for every token
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String word = termAtt.toString();
            tokenCount++;
            if (tokenCount > maxNumTokensParsed) {
                break;
            }
            if (isNoiseWord(word)) {
                continue;
            }
            // increment frequency
            Int cnt = termFreqMap.get(word);
            if (cnt == null) {
                termFreqMap.put(word, new Int());
            } else {
                cnt.x++;
            }
        }
        ts.end();
    } finally {
        IOUtils.closeWhileHandlingException(ts);
    }
}
From source file:org.hibernate.search.test.util.AnalyzerUtils.java
License:Open Source License
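This test helper materializes all tokens from an analysis run into a Token array.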
public static Token[] tokensFromAnalysis(Analyzer analyzer, String field, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
    TermAttribute term = stream.addAttribute(TermAttribute.class);
    List<Token> tokenList = new ArrayList<Token>();
    while (stream.incrementToken()) {
        tokenList.add(new Token(term.term(), 0, 0));
    }
    return tokenList.toArray(new Token[tokenList.size()]);
}