List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
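A typical consumer registers the attributes it needs with addAttribute() before iterating, then reads the same attribute instances inside the incrementToken() loop. A minimal sketch of that pattern, assuming Lucene 4.x or later (where reset(), end() and close() are required) and an arbitrary field name:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal consume loop: addAttribute() returns the shared attribute instance
// that incrementToken() updates in place for every token.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {   // "field" is an arbitrary name
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                     // must be called before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        ts.end();                       // records end-of-stream state (e.g. final offset)
    }                                   // try-with-resources closes the stream
}

The examples below show variations of this loop taken from real projects.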
From source file:org.nuxeo.ecm.platform.categorization.categorizer.tfidf.TfIdfCategorizer.java
License:Open Source License
public List<String> tokenize(String textContent) {
    try {
        List<String> terms = new ArrayList<String>();
        TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();
        return terms;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
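Since the example only calls end() and close() on the success path, the stream would stay open if incrementToken() threw. Because TokenStream implements Closeable, a try-with-resources variant releases it in all cases; a sketch under the same assumptions (null field name, the class's getAnalyzer()):

public List<String> tokenize(String textContent) {
    List<String> terms = new ArrayList<String>();
    try (TokenStream tokenStream = getAnalyzer().tokenStream(null, textContent)) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            terms.add(charTermAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    return terms;
}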
From source file:org.olat.search.ui.SearchInputController.java
License:Apache License
protected Set<String> getHighlightWords(final String searchString) {
    try {
        final Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
        final TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        for (boolean next = stream.incrementToken(); next; next = stream.incrementToken()) {
            final String term = termAtt.term();
            if (log.isDebug()) {
                log.debug(term);
            }
        }
    } catch (final IOException e) {
        log.error("", e);
    }
    return null;
}
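Note that this method only logs the terms and always returns null. TermAttribute and its term() method belong to the old pre-4.0 attribute API and were removed in Lucene 4.0; on current versions the same loop would use CharTermAttribute instead, roughly as sketched below (newer versions would also require reset()/end()/close(), which this example omits):

final TokenStream stream = analyzer.tokenStream("content", new StringReader(searchString));
final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
while (stream.incrementToken()) {
    final String term = termAtt.toString();   // toString() replaces the removed term()
    if (log.isDebug()) {
        log.debug(term);
    }
}
stream.end();
stream.close();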
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file:org.omegat.tokenizer.BaseTokenizer.java
License:Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
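In both BaseTokenizer methods the addAttribute()/getAttribute() pairs could be collapsed: addAttribute() registers the attribute if it is not yet present and returns the shared instance, and calling it again for the same class returns the same object. A sketch of the shorter form:

final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
// addAttribute() both registers the attribute and returns the instance that
// incrementToken() will update, so separate getAttribute() calls are unnecessary.
CharTermAttribute cattr = in.addAttribute(CharTermAttribute.class);
OffsetAttribute off = in.addAttribute(OffsetAttribute.class);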
From source file:org.opencloudengine.flamingo.mapreduce.util.Lucene4Utils.java
License:Apache License
public static List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<String>();
    TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(string));
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        result.add(charTermAttribute.toString());
    }
    return result;
}
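This helper never calls reset(), end() or close(); on Lucene 4.x and later, calling incrementToken() without a prior reset() typically fails with an IllegalStateException about a TokenStream contract violation. A variant with the full lifecycle (same signature, the unused OffsetAttribute dropped) might look like this:

public static List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<String>();
    try (TokenStream tokenStream = analyzer.tokenStream("default", new StringReader(string))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(charTermAttribute.toString());
        }
        tokenStream.end();
    }
    return result;
}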
From source file:org.opengrok.web.api.v1.suggester.query.SuggesterQueryParser.java
License:Open Source License
private static List<String> getAllTokens(final Analyzer analyzer, final String field, final String text) {
    List<String> tokens = new LinkedList<>();
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream(field, text);
        CharTermAttribute attr = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            tokens.add(attr.toString());
        }
    } catch (IOException e) {
        logger.log(Level.WARNING, "Could not analyze query text", e);
    } finally {
        try {
            if (ts != null) {
                ts.end();
                ts.close();
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Could not close token stream", e);
        }
    }
    return tokens;
}
From source file:org.opensextant.solrtexttagger.Tagger.java
License:Open Source License
public Tagger(Terms terms, Bits liveDocs, TokenStream tokenStream, TagClusterReducer tagClusterReducer,
        boolean skipAltTokens, boolean ignoreStopWords) throws IOException {
    this.terms = terms;
    this.liveDocs = liveDocs;
    this.tokenStream = tokenStream;
    this.skipAltTokens = skipAltTokens;
    this.ignoreStopWords = ignoreStopWords;
    // termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    byteRefAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    lookupAtt = tokenStream.addAttribute(TaggingAttribute.class);
    tokenStream.reset();
    this.tagClusterReducer = tagClusterReducer;
}
From source file:org.pageseeder.flint.lucene.query.Queries.java
License:Apache License
/**
 * Returns the terms for a field
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 *
 * @throws IOException
 */
private static void addTermsToPhrase(String field, String text, Analyzer analyzer, PhraseQuery phrase) {
    try {
        TokenStream stream = analyzer.tokenStream(field, text);
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            Term term = new Term(field, attribute.toString());
            phrase.add(term, position);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
}
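This example targets the older mutable PhraseQuery with an add(Term, int) method. In newer Lucene releases (5.3 onward) PhraseQuery is assembled through PhraseQuery.Builder, so the same analysis loop would roughly become the following (a hypothetical toPhrase() helper that returns the built query instead of mutating an argument):

private static PhraseQuery toPhrase(String field, String text, Analyzer analyzer) throws IOException {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
        PositionIncrementAttribute increment = stream.addAttribute(PositionIncrementAttribute.class);
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += increment.getPositionIncrement();
            builder.add(new Term(field, attribute.toString()), position);
        }
        stream.end();
    }
    return builder.build();
}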
From source file:org.pageseeder.flint.lucene.search.Fields.java
License:Apache License
/**
 * Returns the terms for a field
 *
 * @param field    The field
 * @param text     The text to analyze
 * @param analyzer The analyzer
 *
 * @return the corresponding list of terms produced by the analyzer.
 *
 * @throws IOException
 */
public static List<String> toTerms(String field, String text, Analyzer analyzer) {
    List<String> terms = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(text));
        CharTermAttribute attribute = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = attribute.toString();
            terms.add(term);
        }
        stream.end();
        stream.close();
    } catch (IOException ex) {
        // Should not occur since we use a StringReader
        ex.printStackTrace();
    }
    return terms;
}
From source file:org.solbase.lucenehbase.IndexWriter.java
License:Apache License
@SuppressWarnings("unchecked") public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber, List<String> sortFieldNames) throws CorruptIndexException, IOException { // given doc, what are all of terms we indexed List<Term> allIndexedTerms = new ArrayList<Term>(); Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024); // need to hold onto TermDocMetaData, so it can return this array List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>(); byte[] docId = Bytes.toBytes(docNumber); int position = 0; for (Fieldable field : (List<Fieldable>) doc.getFields()) { // Indexed field if (field.isIndexed() && field.isTokenized()) { TokenStream tokens = field.tokenStreamValue(); if (tokens == null) { tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue())); }/*from www .ja va 2s . com*/ // collect term information per field Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>(); int lastOffset = 0; if (position > 0) { position += analyzer.getPositionIncrementGap(field.name()); } tokens.reset(); // reset the TokenStream to the first token // offsets OffsetAttribute offsetAttribute = null; if (field.isStoreOffsetWithTermVector()) offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class); // positions PositionIncrementAttribute posIncrAttribute = null; if (field.isStorePositionWithTermVector()) posIncrAttribute = (PositionIncrementAttribute) tokens .addAttribute(PositionIncrementAttribute.class); TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class); // store normalizations of field per term per document // rather // than per field. // this adds more to write but less to read on other side Integer tokensInField = new Integer(0); while (tokens.incrementToken()) { tokensInField++; Term term = new Term(field.name(), termAttribute.term()); allIndexedTerms.add(term); // fetch all collected information for this term Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term); if (termInfo == null) { termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); allTermInformation.put(term, termInfo); } // term frequency List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes); if (termFrequency == null) { termFrequency = new ArrayList<Number>(); termFrequency.add(new Integer(0)); termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency); } // increment termFrequency.set(0, termFrequency.get(0).intValue() + 1); // position vector if (field.isStorePositionWithTermVector()) { position += (posIncrAttribute.getPositionIncrement() - 1); List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes); if (positionVector == null) { positionVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector); } positionVector.add(++position); } // term offsets if (field.isStoreOffsetWithTermVector()) { List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes); if (offsetVector == null) { offsetVector = new ArrayList<Number>(); termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector); } offsetVector.add(lastOffset + offsetAttribute.startOffset()); offsetVector.add(lastOffset + offsetAttribute.endOffset()); } List<Number> sortValues = new ArrayList<Number>(); // init sortValues for (int i = 0; i < Scorer.numSort; i++) { sortValues.add(new Integer(-1)); } int order = 0; // extract sort field 
value and store it in term doc metadata obj for (String fieldName : sortFieldNames) { Fieldable fieldable = doc.getFieldable(fieldName); if (fieldable instanceof EmbeddedSortField) { EmbeddedSortField sortField = (EmbeddedSortField) fieldable; int value = -1; if (sortField.stringValue() != null) { value = Integer.parseInt(sortField.stringValue()); } int sortSlot = sortField.getSortSlot(); sortValues.set(sortSlot - 1, new Integer(value)); } else { // TODO: this logic is used for real time indexing. // hacky. depending on order of sort field names in array int value = -1; if (fieldable.stringValue() != null) { value = Integer.parseInt(fieldable.stringValue()); } sortValues.set(order++, new Integer(value)); } } termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues); } List<Number> bnorm = null; if (!field.getOmitNorms()) { bnorm = new ArrayList<Number>(); float norm = doc.getBoost(); norm *= field.getBoost(); norm *= similarity.lengthNorm(field.name(), tokensInField); bnorm.add(Similarity.encodeNorm(norm)); } for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) { Term tempTerm = term.getKey(); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm); // Mix in the norm for this field alongside each term // more writes but faster on read side. if (!field.getOmitNorms()) { term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm); } TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes, tempTerm); metadatas.add(data); } } // Untokenized fields go in without a termPosition if (field.isIndexed() && !field.isTokenized()) { Term term = new Term(field.name(), field.stringValue()); allIndexedTerms.add(term); byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term); Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>(); termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {})); termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {})); TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term); metadatas.add(data); } // Stores each field as a column under this doc key if (field.isStored()) { byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue()); // first byte flags if binary or not byte[] value = new byte[_value.length + 1]; System.arraycopy(_value, 0, value, 0, _value.length); value[value.length - 1] = (byte) (field.isBinary() ? 
Byte.MAX_VALUE : Byte.MIN_VALUE); // logic to handle multiple fields w/ same name byte[] currentValue = fieldCache.get(field.name()); if (currentValue == null) { fieldCache.put(field.name(), value); } else { // append new data byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length - 1]; System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1); System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1, SolbaseUtil.delimiter.length); System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1, value.length); fieldCache.put(field.name(), newValue); } } } Put documentPut = new Put(SolbaseUtil.randomize(docNumber)); // Store each field as a column under this docId for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) { documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue()); } // in case of real time update, we need to add back docId field if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) { byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString()); // first byte flags if binary or not byte[] value = new byte[docIdStr.length + 1]; System.arraycopy(docIdStr, 0, value, 0, docIdStr.length); value[value.length - 1] = (byte) (Byte.MIN_VALUE); documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value); } // Finally, Store meta-data so we can delete this document documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"), SolbaseUtil.toBytes(allIndexedTerms).array()); ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms); return parsedDoc; }