Example usage for org.apache.lucene.analysis TokenStream getAttribute

Introduction

This page collects example usages of org.apache.lucene.analysis.TokenStream#getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed-in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
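
A minimal sketch of the typical call pattern (assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; the field name "body" and the sample text are placeholders):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello token streams"));
        // getAttribute only retrieves an attribute already registered on the
        // stream; depending on the Lucene version it throws or returns null when
        // the attribute is absent. addAttribute(Class) is the defensive
        // alternative, since it creates the attribute when missing.
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset(); // required before the first incrementToken() in Lucene 4+
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}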

Usage

From source file: com.bigdata.search.AbstractSearchTest.java

License: Open Source License

protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}
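
TermAttribute and its term() method are pre-4.0 Lucene API. On Lucene 4+ the same helper would use CharTermAttribute and honor the reset/end/close contract; a hedged equivalent sketch:

protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuilder sb = new StringBuilder();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    // Fetch the attribute once, outside the loop; incrementToken() mutates the
    // same attribute instance in place, so this is equivalent to fetching it
    // on every iteration as the original does.
    CharTermAttribute term = s.getAttribute(CharTermAttribute.class);
    s.reset(); // mandatory before the first incrementToken() on modern Lucene
    while (s.incrementToken()) {
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.toString());
    }
    s.end();
    s.close();
    return sb.toString();
}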

From source file: com.bigdata.search.AbstractSearchTest.java

License: Open Source License

private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}

From source file: com.bigdata.search.FullTextIndex.java

License: Open Source License

/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but
 * the same fields exist in the old and new versions of the document).
 * 
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream            
 * 
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
        final Reader r, final boolean filterStopwords) {

    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    //        assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {

        while (tokenStream.incrementToken()) {

            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);

            buffer.add(docId, fieldId, term.term());

            n++;

        }

    } catch (IOException ioe) {

        throw new RuntimeException(ioe);

    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);

}
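
The javadoc's contract is that the caller must flush the buffer once indexing is done. A hypothetical calling sequence (the TokenBuffer constructor arguments and field identifiers here are assumptions for illustration, not the real bigdata API):

// Hypothetical sketch of the calling contract described in the javadoc above.
TokenBuffer<Long> buffer = new TokenBuffer<Long>(capacity, textIndex); // assumed constructor
textIndex.index(buffer, docId, 0 /* fieldId */, "en", new StringReader(title), true);
textIndex.index(buffer, docId, 1 /* fieldId */, "en", new StringReader(body), true);
buffer.flush(); // MUST be invoked after the last field, or buffered tuples are lost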

From source file: com.bizosys.hsearch.inpipe.ComputeTokens.java

License: Apache License

private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}

From source file: com.bizosys.hsearch.inpipe.util.StemFilterWrap.java

License: Apache License

public StemFilterWrap(TokenStream in) {
    super(in);
    stemmer = Stemmer.getInstance();
    this.termA = in.getAttribute(CharTermAttribute.class);
}
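
getAttribute in a TokenFilter constructor works only when the wrapped stream already carries a CharTermAttribute; the more defensive idiom is addAttribute, which returns the existing instance or registers a new one. A variant of the constructor above, as a sketch:

public StemFilterWrap(TokenStream in) {
    super(in);
    stemmer = Stemmer.getInstance();
    // A TokenFilter shares its input's AttributeSource, so addAttribute here
    // returns the existing CharTermAttribute or creates one if absent.
    this.termA = addAttribute(CharTermAttribute.class);
}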

From source file: com.bizosys.hsearch.kv.impl.StorageReader.java

License: Apache License

/**
 * Returns the ids for an analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;
    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;

    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));
    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;
    try {

        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();

        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {
            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();
            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());
            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");
            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {

                if (size == 0)
                    continue;
                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null)
                    continue;
                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);
                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }
        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
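
The checkForAllWords branch above is the usual intersect-versus-union pattern over per-term bitsets. A standalone sketch with java.util.BitSet (class and method names are illustrative, not from the hsearch API):

import java.util.BitSet;
import java.util.List;

final class TermSetOps {
    /** AND the per-term matches for all-words semantics, OR them otherwise. */
    static BitSet combine(List<BitSet> perTermMatches, boolean checkForAllWords) {
        BitSet destination = null;
        for (BitSet matches : perTermMatches) {
            if (destination == null) {
                destination = (BitSet) matches.clone(); // the "isVirgin" first-term case
            } else if (checkForAllWords) {
                destination.and(matches); // every word must match the document
            } else {
                destination.or(matches);  // any single word is enough
            }
        }
        return destination == null ? new BitSet() : destination;
    }
}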

From source file: com.bizosys.hsearch.kv.impl.StorageReader.java

License: Apache License

/**
 * Returns the ids for an analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;

            if (biWord)
                subsequenceLen = 2;
            else if (triWord)
                subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10.
             */
            if (triWord && (termsT > 4))
                subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5))
                subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);

            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0)
                    break;

                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {

                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0)
                            sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();

                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("Normal Query processing");

            //check for all words 

            BitSetOrSet highRanked = null;
            switch (termsT) {
            case 2: {

                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();

                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            case 3: {

                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /**
                 * If checkForAllWords is true, the minimum required result is 1
                 * for three words; otherwise the minimum required result is 0.
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {

                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;

                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();
                int requiredMinResult = 0;
                if (triWord) {

                    requiredMinResult = checkForAllWords ? 1 : 0;

                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                if (biWord) {

                    requiredMinResult = checkForAllWords ? 2 : 0;

                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;

                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */

                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);

            }
            default: {
                /*
                 * Biword and Triword is not enabled so search for all words.
                 */

                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            }
        }
    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
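
The "red party gown dress" comment block inside the method above describes plain sliding-window phrase generation. Extracted as a standalone sketch (names are illustrative):

import java.util.ArrayList;
import java.util.List;

final class PhraseExplosion {
    /** All sliding-window phrases of length subsequenceLen down to 1, longest first. */
    static List<String> explode(String[] terms, int subsequenceLen) {
        List<String> phrases = new ArrayList<String>();
        StringBuilder sb = new StringBuilder();
        for (int len = subsequenceLen; len > 0; len--) {
            for (int start = 0; start + len <= terms.length; start++) {
                sb.setLength(0);
                for (int pos = 0; pos < len; pos++) {
                    if (pos > 0) {
                        sb.append(' ');
                    }
                    sb.append(terms[start + pos]);
                }
                phrases.add(sb.toString());
            }
        }
        return phrases;
    }
}

Calling explode(new String[] { "red", "party", "gown", "dress" }, 3) yields exactly the nine phrases listed in the comment, longest first.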

From source file: com.bizosys.hsearch.kv.indexing.KVMapperBase.java

License: Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException {

    terms.clear();
    CharTermAttribute termAttribute = null;
    TokenStream stream = null;

    int wordhash;
    String wordhashStr;
    char firstChar;
    char lastChar;

    try {
        if (isFieldNull)
            return;

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name);
        stream = analyzer.tokenStream(fld.name, new StringReader(fldValue));
        termAttribute = stream.getAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            String termWord = termAttribute.toString();
            wordhash = Hashing.hash(termWord);
            wordhashStr = new Integer(wordhash).toString();
            firstChar = wordhashStr.charAt(0);
            lastChar = wordhashStr.charAt(wordhashStr.length() - 1);

            rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar;
            appender.delete(0, appender.capacity());
            rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                    .toString();
            appender.delete(0, appender.capacity());
            rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR)
                    .append(wordhash).toString();

            context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file: com.bizosys.hsearch.kv.indexing.KVMapperBase.java

License: Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public final void mapFreeTextBitset(final Field fld, final Context context)
        throws IOException, InterruptedException {

    terms.clear();
    CharTermAttribute termAttribute = null;
    TokenStream stream = null;
    try {
        if (isFieldNull)
            return;

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name);
        stream = analyzer.tokenStream(fld.name, new StringReader(fldValue));
        termAttribute = stream.getAttribute(CharTermAttribute.class);
        String last2 = null;
        String last1 = null;
        while (stream.incrementToken()) {
            String termWord = termAttribute.toString();

            if (0 == termWord.length())
                continue;

            appender.delete(0, appender.capacity());

            /**
             * Row Key is mergeidFIELDwordhashStr
             */
            boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0);
            String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name;

            rowKeyP1 = rowKeyPrefix + termWord;
            rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                    .toString();

            appender.setLength(0);
            rowVal = appender.append(incrementalIdSeekPosition).toString();

            context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));

            if (!fld.isBiWord && !fld.isTriWord)
                continue;

            /**
             * Do Three phrase word
             */
            if (null != last2) {
                appender.setLength(0);

                rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ')
                        .append(termWord).append(' ').append('*').toString();

                appender.setLength(0);
                rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                        .toString();

                context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
            }

            /**
             * Do Two phrase word
             */
            if (null != last1) {

                appender.setLength(0);
                rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ')
                        .append('*').toString();

                appender.setLength(0);
                rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                        .toString();

                context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
            }

            last2 = last1;
            last1 = termWord;

        }
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Error While tokenizing : " + e.getMessage());
    } finally {
        try {
            if (null != stream)
                stream.close();
        } catch (Exception ex) {
            IdSearchLog.l.warn("Error during Tokenizer Stream closure");
        }
    }
}

From source file: com.bizosys.unstructured.CustomAnalyzerExample.java

License: Apache License

public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));
    Analyzer analyzer = new CustomAnalyzerExample();

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}