Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usage of org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. The method returns an instance of the requested attribute, adding it to the stream's attributes if it is not already present.
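
The examples below all follow the same pattern: register the attributes of interest with addAttribute, reset the stream, then read the attribute instances after each call to incrementToken. The following is a minimal, self-contained sketch of that pattern, not taken from the examples below, assuming a recent Lucene release (no-argument StandardAnalyzer constructor, TokenStream usable in try-with-resources); the field name "content" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream stream = analyzer.tokenStream("content", "Lucene token streams expose attributes")) {
            // addAttribute returns the attribute instance, creating and registering it only if absent
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
            stream.reset(); // required before the first call to incrementToken()
            while (stream.incrementToken()) {
                // the same attribute instances are updated in place for every token
                System.out.println(termAtt.toString() + " [" + offsetAtt.startOffset() + ", " + offsetAtt.endOffset() + "]");
            }
            stream.end(); // records end-of-stream state such as the final offset
        }
        analyzer.close();
    }
}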

Usage

From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java

License:Open Source License

public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file:org.olat.search.ui.SearchInputController.java

License:Apache License

protected Set<String> getHighlightWords(final String searchString) {
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        for (boolean next = stream.incrementToken(); next; next = stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebug()) {
                log.debug(term);
            }
        }
    } catch (final IOException e) {
        log.error("", e);
    }
    return null;
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

License:Open Source License

protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }

    List<Token> result = new ArrayList<Token>(64);

    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);

    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

License:Open Source License

protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }

    List<String> result = new ArrayList<String>(64);

    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);

    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;

    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}

From source file:org.opencloudengine.flamingo.mapreduce.util.Lucene4Utils.java

License:Apache License

public static List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(string));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        result.add(charTermAttribute.toString());
    }
    return result;
}

From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java

License:Open Source License

private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    List<String> tokens = new LinkedList<>();

    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(field, text);

        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class);

        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(attr.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        try {
            if (ts != null) {
                ts.end();
                ts.close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Could not close token stream", e);
        }
    }

    return tokens;
}

From source file:org.opensextant.solrtexttagger.Tagger.java

License:Open Source License

public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, TagClusterReducer tagClusterReducer,
        boolean skipAltTokens, boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    //    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    lookupAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();

    this.tagClusterReducer = tagClusterReducer;
}

From source file:org.pageseeder.flint.lucene.query.Queries.java

License:Apache License

/**
 * Adds to the given phrase query the terms produced by the analyzer for the given text.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 * @param phrase   The phrase query the terms are added to
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    try {
        TokenStream stream = analyzer.tokenStream(field, text);
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.toString());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}

From source file:org.pageseeder.flint.lucene.search.Fields.java

License:Apache License

/**
 * Returns the terms for a field.
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    List<String> terms = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.toString();
            terms.add(term);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}

From source file:org.solbase.lucenehbase.IndexWriter.java

License:Apache License

@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber,
        List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given doc, what are all of terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);

    // need to hold onto TermDocMetaData, so it can return this array
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();

    byte[] docId = Bytes.toBytes(docNumber);
    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            tokens.reset(); // reset the TokenStream to the first token

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document
            // rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }

                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }

                int order = 0;

                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);

                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;

                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();

                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();

                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }

                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes,
                        tempTerm);
                metadatas.add(data);
            }
        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);

            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));

            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            // logic to handle multiple fields w/ same name
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {

                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length
                        - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1,
                        SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1,
                        value.length);

                fieldCache.put(field.name(), newValue);
            }
        }
    }

    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));

    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }

    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {

        byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
        // first byte flags if binary or not
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);

        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }

    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"),
            SolbaseUtil.toBytes(allIndexedTerms).array());

    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;

}