List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value.
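Before the source-file examples, here is a minimal, self-contained sketch of the typical call pattern. It is not taken from any of the files below; it assumes a Lucene 5.x-or-later StandardAnalyzer on the classpath, and the field name "body" and the sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", "a minimal getAttribute example");
            // getAttribute returns the attribute instance already registered on this stream,
            // or throws IllegalArgumentException if it is absent (addAttribute would create it).
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();
            stream.close();
        }
    }
}

The same loop appears throughout the examples below; older snippets use the deprecated TermAttribute and term() instead of CharTermAttribute and toString().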
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}
From source file:com.bigdata.search.FullTextIndex.java
License:Open Source License
/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but
 * the same fields exist in the old and new versions of the document).
 *
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream
 *
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId,
        final String languageCode, final Reader r, final boolean filterStopwords) {

    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    // assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {
        while (tokenStream.incrementToken()) {
            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
            buffer.add(docId, fieldId, term.term());
            n++;
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);
}
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}
From source file:com.bizosys.hsearch.inpipe.util.StemFilterWrap.java
License:Apache License
public StemFilterWrap(TokenStream in) {
    super(in);
    stemmer = Stemmer.getInstance();
    this.termA = (CharTermAttribute) in.getAttribute(CharTermAttribute.class);
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;
    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;

    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));

    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;

    try {
        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();
        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {

            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();

            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());

            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");

            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {
                if (size == 0)
                    continue;

                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null)
                    continue;

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }

        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {
        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;
            if (biWord)
                subsequenceLen = 2;
            else if (triWord)
                subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10
             */
            if (triWord && (termsT > 4))
                subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5))
                subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);
            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0)
                    break;
                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {
                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0)
                            sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();
                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("Normal Query processing");

            // check for all words
            BitSetOrSet highRanked = null;

            switch (termsT) {

            case 2: {
                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);

                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            case 3: {
                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);

                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /**
                 * If Check for all words is true minimum required result is 1 for three words
                 * else minimum required result is 0
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {
                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();

                int requiredMinResult = 0;

                if (triWord) {
                    requiredMinResult = checkForAllWords ? 1 : 0;
                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                if (biWord) {
                    requiredMinResult = checkForAllWords ? 2 : 0;
                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }

            default: {
                /*
                 * Biword and Triword is not enabled so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            }
        }

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException { terms.clear();/*w w w . ja v a2 s. co m*/ CharTermAttribute termAttribute = null; TokenStream stream = null; int wordhash; String wordhashStr; char firstChar; char lastChar; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { String termWord = termAttribute.toString(); wordhash = Hashing.hash(termWord); wordhashStr = new Integer(wordhash).toString(); firstChar = wordhashStr.charAt(0); lastChar = wordhashStr.charAt(wordhashStr.length() - 1); rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar; appender.delete(0, appender.capacity()); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.delete(0, appender.capacity()); rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR) .append(wordhash).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();//from w ww.ja va2s .co m CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.CustomAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));

    Analyzer analyzer = new CustomAnalyzerExample();
    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}