List of usage examples for org.apache.lucene.analysis.TokenStream.incrementToken()
public abstract boolean incrementToken() throws IOException;
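Before the project-specific examples below, here is a minimal, self-contained sketch of the consumption loop that incrementToken() participates in. It is not taken from the source files listed below; the field name, analyzer, and sample text are placeholders, and it assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor and TokenStream is Closeable. Note the reset()/end()/close() calls, which the TokenStream contract requires even though several of the older snippets below omit them.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();                   // any analyzer works here
        try (TokenStream stream = analyzer.tokenStream("body", "a quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                                           // mandatory before the first incrementToken()
            while (stream.incrementToken()) {                         // returns false when no more tokens are available
                System.out.println(term.toString());
            }
            stream.end();                                             // records end-of-stream state (e.g. final offset)
        }
        analyzer.close();
    }
}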
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;

    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;
    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));

    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;

    try {
        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();
        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {

            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();
            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());
            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");

            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {
                if (size == 0) continue;
                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null) continue;

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }

        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.impl.StorageReader.java
License:Apache License
/**
 * Returns the ids for analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {
        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();
        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }
        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED) IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;
            if (biWord) subsequenceLen = 2;
            else if (triWord) subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10
             */
            if (triWord && (termsT > 4)) subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5)) subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);
            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0) break;
                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {
                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0) sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();
                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED) IdSearchLog.l.debug("Normal Query processing");

            //check for all words
            BitSetOrSet highRanked = null;

            switch (termsT) {

            case 2: {
                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase) return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            case 3: {
                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase) return destination;
                }

                /**
                 * If Check for all words is true minimum required result is 1 for three words
                 * else minimum required result is 0
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {
                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();

                int requiredMinResult = 0;

                if (triWord) {
                    requiredMinResult = checkForAllWords ? 1 : 0;
                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                if (biWord) {
                    requiredMinResult = checkForAllWords ? 2 : 0;
                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };
                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase, false);
                        if (result > 0) found++;
                    }
                    if (found > requiredMinResult || checkExactPhrase) return highRanked;
                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }

            default: {
                /*
                 * Biword and Triword is not enabled so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination, rowIdPrefix);
            }
            }
        }

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException { terms.clear();/* w w w. j a va 2 s . c om*/ CharTermAttribute termAttribute = null; TokenStream stream = null; int wordhash; String wordhashStr; char firstChar; char lastChar; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); while (stream.incrementToken()) { String termWord = termAttribute.toString(); wordhash = Hashing.hash(termWord); wordhashStr = new Integer(wordhash).toString(); firstChar = wordhashStr.charAt(0); lastChar = wordhashStr.charAt(wordhashStr.length() - 1); rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar; appender.delete(0, appender.capacity()); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.delete(0, appender.capacity()); rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR) .append(wordhash).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } } catch (Exception e) { e.printStackTrace(); } }
From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) public final void mapFreeTextBitset(final Field fld, final Context context) throws IOException, InterruptedException { terms.clear();/*from ww w. j av a2 s . com*/ CharTermAttribute termAttribute = null; TokenStream stream = null; try { if (isFieldNull) return; Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name); stream = analyzer.tokenStream(fld.name, new StringReader(fldValue)); termAttribute = stream.getAttribute(CharTermAttribute.class); String last2 = null; String last1 = null; while (stream.incrementToken()) { String termWord = termAttribute.toString(); if (0 == termWord.length()) continue; appender.delete(0, appender.capacity()); /** * Row Key is mergeidFIELDwordhashStr */ boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0); String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name; rowKeyP1 = rowKeyPrefix + termWord; rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); appender.setLength(0); rowVal = appender.append(incrementalIdSeekPosition).toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); if (!fld.isBiWord && !fld.isTriWord) continue; /** * Do Three phrase word */ if (null != last2) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ') .append(termWord).append(' ').append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } /** * Do Two phrase word */ if (null != last1) { appender.setLength(0); rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ') .append('*').toString(); appender.setLength(0); rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq) .toString(); context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal)); } last2 = last1; last1 = termWord; } } catch (Exception e) { e.printStackTrace(); System.err.println("Error While tokenizing : " + e.getMessage()); } finally { try { if (null != stream) stream.close(); } catch (Exception ex) { IdSearchLog.l.warn("Error during Tokenizer Stream closure"); } } }
From source file:com.bizosys.unstructured.CustomAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));

    Analyzer analyzer = new CustomAnalyzerExample();

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}
From source file:com.bizosys.unstructured.IndexWriter.java
License:Apache License
/**
 * Find the last offset.
 * Find each term offset
 *
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset) position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter) row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }

    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
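As a side note to the tokenize(...) method above, which infers term positions by comparing end offsets: a more common way to track positions while iterating with incrementToken() is PositionIncrementAttribute. The sketch below is not part of the original IndexWriter class; the analyzer, field name, and sample text are placeholders, and it assumes a recent Lucene release where StandardAnalyzer has a no-argument constructor.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionTrackingSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", new StringReader("to be or not to be"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

        int position = -1;
        stream.reset();
        while (stream.incrementToken()) {
            position += posIncr.getPositionIncrement();   // filters that remove tokens may leave gaps > 1
            System.out.println(term.toString() + " pos=" + position
                    + " [" + offset.startOffset() + "," + offset.endOffset() + ")");
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}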
From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {

    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO,
            Field.Index.ANALYZED));

    Analyzer analyzer = new StopwordAndSynonymAnalyzer();

    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);

        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        sr.close();
    }
    analyzer.close();
}
From source file:com.bizosys.unstructured.SynonumAnalyzerExample.java
License:Apache License
public static void main(String[] args) throws Exception {

    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));

    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");

    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        sr.close();
    }
}
From source file:com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java
License:Apache License
/**
 * Creates a query to find intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;

    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}
From source file:com.chimpler.example.bayes.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {

    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}