Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
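
The general consumer contract is: obtain a TokenStream, call reset(), loop on incrementToken() while reading attributes such as CharTermAttribute, then call end() and close(). Below is a minimal sketch of that loop, written against Lucene 4.3 to match the Classifier example further down this page (the field name "body" and the sample text are illustrative placeholders; note that several of the older 3.x snippets below skip reset(), which recent Lucene versions require before the first incrementToken()):

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        // "body" and the sample text are illustrative placeholders.
        TokenStream stream = analyzer.tokenStream("body", new StringReader("Lucene token stream example"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        try {
            stream.reset();                   // required before the first incrementToken()
            while (stream.incrementToken()) { // advances to the next token; returns false at end of stream
                System.out.println(term.toString());
            }
            stream.end();                     // records the final offset state
        } finally {
            stream.close();
        }
        analyzer.close();
    }
}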

Usage

From source file:com.bizosys.hsearch.kv.impl.StorageReader.java

License:Apache License

/**
 * Returns the ids for an analyzed field that is not repeatable.
 * @param checkForAllWords
 * @param fieldName
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsSet(final boolean checkForAllWords, final String fieldName,
        String fieldQuery) throws IOException {

    StringBuilder sb = new StringBuilder();
    String docType = "*";
    String fieldType = fieldName;
    String wordHash = null;
    int hash = 0;
    BitSetOrSet destination = new BitSetOrSet();
    boolean isVirgin = true;
    String currentRowId = null;

    String mergeid = rowId.substring(0, rowId.lastIndexOf('_'));
    int fieldTypeLoc = fieldName.indexOf('/');
    if (fieldTypeLoc > 0) {
        docType = fieldName.substring(0, fieldTypeLoc);
        fieldType = fieldName.substring(fieldTypeLoc + 1);
    }

    byte[] dataChunk = null;
    try {

        Map<String, Integer> dTypes = new HashMap<String, Integer>(1);
        dTypes.put(docType, 1);
        setDocumentTypeCodes(dTypes);

        Map<String, Integer> fTypes = new HashMap<String, Integer>(1);
        fTypes.put(fieldType, 1);
        setFieldTypeCodes(fTypes);

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        String docTypeCode = "*".equals(docType) ? "*"
                : new Integer(DocumentTypeCodes.getInstance().getCode(docType)).toString();

        String fldTypeCode = "*".equals(fieldType) ? "*"
                : new Integer(FieldTypeCodes.getInstance().getCode(fieldType)).toString();

        for (String term : terms) {
            if (DEBUG_ENABLED) {
                IdSearchLog.l.debug("Finding Term :" + term);
            }

            hash = Hashing.hash(term);
            wordHash = new Integer(hash).toString();
            sb.delete(0, sb.length());
            fieldQuery = sb.append(docTypeCode).append('|').append(fldTypeCode).append('|').append('*')
                    .append('|').append(hash).append('|').append("*|*").toString();
            sb.delete(0, sb.length());
            currentRowId = mergeid + "_" + wordHash.charAt(0) + "_" + wordHash.charAt(wordHash.length() - 1);

            ComputeKV compute = new ComputeKV();
            compute.kvType = (instruction.getOutputType() == Datatype.FREQUENCY_INDEX) ? Datatype.STRING
                    : instruction.getOutputType();
            compute.kvRepeatation = instruction.getProcessingHint().startsWith("true");
            compute.isCompressed = instruction.getProcessingHint().endsWith("true");
            byte[] data = KvRowReaderFactory.getInstance().getReader(this.isCachable).readStoredProcedureBlob(
                    tableName, currentRowId.getBytes(), compute, null, null, filterQuery, instruction);

            Collection<byte[]> dataL = SortedBytesArray.getInstanceArr().parse(data).values();
            int size = (null == dataL) ? 0 : dataL.size();

            if (checkForAllWords) {
                if (size > 0) {
                    dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                    if (dataChunk == null) {
                        destination.clear();
                        break;
                    }
                } else {
                    destination.clear();
                    break;
                }

                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);

                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                }

                BitSetOrSet source = new BitSetOrSet();
                source.setDocumentSequences(bitSets);
                destination.and(source);

            } else {

                if (size == 0)
                    continue;
                dataChunk = dataL.isEmpty() ? null : dataL.iterator().next();
                if (dataChunk == null)
                    continue;
                BitSetWrapper bitSets = SortedBytesBitset.getInstanceBitset().bytesToBitSet(dataChunk, 0,
                        dataChunk.length);
                if (isVirgin) {
                    destination.setDocumentSequences(bitSets);
                    isVirgin = false;
                    continue;
                } else {
                    BitSetOrSet source = new BitSetOrSet();
                    source.setDocumentSequences(bitSets);
                    destination.or(source);
                }
            }
        }
        return destination;

    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        msg = msg + "Found Data Chunk\t" + ((null == dataChunk) ? "None" : new String(dataChunk));
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}

From source file:com.bizosys.hsearch.kv.impl.StorageReader.java

License:Apache License

/**
 * Returns the ids for an analyzed field that is repeatable.
 * @param checkForAllWords
 * @param biWord
 * @param triWord
 * @param isCompressed
 * @param isCached
 * @param fieldName
 * @param enableNGram
 * @return
 * @throws IOException
 */
private final BitSetOrSet readStorageTextIdsBitset(final boolean checkForAllWords, final String fieldQuery,
        final boolean biWord, final boolean triWord, boolean isCompressed, boolean isCached,
        final String fieldName, boolean enableNGram, boolean checkExactPhrase) throws IOException {

    BitSetOrSet destination = new BitSetOrSet();
    String rowIdPrefix = rowId;

    try {

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fieldName);
        TokenStream stream = analyzer.tokenStream(fieldName, new StringReader(fieldQuery));
        CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);

        Set<String> terms = new LinkedHashSet<String>();

        while (stream.incrementToken()) {
            terms.add(termAttribute.toString());
        }

        int termsT = terms.size();

        if (enableNGram) {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGRam Explosion");

            int subsequenceLen = 1;

            if (biWord)
                subsequenceLen = 2;
            else if (triWord)
                subsequenceLen = 3;

            /**
             * There may be a penalty on performance.
             * Don't allow total search phrases > 10.
             */
            if (triWord && (termsT > 4))
                subsequenceLen = 2;
            if ((subsequenceLen == 2) && (termsT > 5))
                subsequenceLen = 1;

            /**
             * "red party gown"
             * "party gown dress"
             * "red party"
             * "party gown"
             * "gown dress"
             * "red"
             * "party"
             * "gown"
             * "dress"
             */
            List<String> phrases = new ArrayList<String>();
            StringBuilder sb = new StringBuilder(1024);

            String[] termsA = new String[terms.size()];
            terms.toArray(termsA);

            for (int subSequence = subsequenceLen; subSequence > 0; subSequence--) {
                if (subSequence <= 0)
                    break;

                for (int wordPosition = 0; wordPosition <= termsT - subSequence; wordPosition++) {

                    for (int pos = 0; pos < subSequence; pos++) {
                        if (pos > 0)
                            sb.append(' ');
                        sb.append(termsA[wordPosition + pos]);
                    }
                    phrases.add(sb.toString());
                    sb.setLength(0);
                }
            }

            for (String phrase : phrases) {
                BitSetOrSet phraseMatches = new BitSetOrSet();

                findATerm(checkForAllWords, isCompressed, isCached, phraseMatches, rowIdPrefix, phrase, false);
                destination.orQueryWithFoundIds.put(phrase, phraseMatches);
                destination.or(phraseMatches);
            }

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("NGram Query OR trace > " + destination.orQueryWithFoundIds.toString());

            return destination;

        } else {

            if (DEBUG_ENABLED)
                IdSearchLog.l.debug("Normal Query processing");

            //check for all words 

            BitSetOrSet highRanked = null;
            switch (termsT) {
            case 2: {

                /**
                 * All 2 words are consecutive
                 */
                if (biWord) {
                    Iterator<String> itr = terms.iterator();
                    String phrase = itr.next() + " " + itr.next();

                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /*
                 * Biword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            case 3: {

                /**
                 * All 3 words are consecutive
                 */
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();

                if (triWord) {
                    String phrase = word1 + " " + word2 + " " + word3;
                    findATerm(checkForAllWords, isCompressed, isCached, destination, rowIdPrefix, phrase, true);
                    BitSetWrapper result = destination.getDocumentSequences();
                    int resultT = (null == result) ? 0 : result.cardinality();
                    if (resultT > 0 || checkExactPhrase)
                        return destination;
                }

                /**
                 * If checkForAllWords is true, the minimum required result is 1 for three words;
                 * otherwise the minimum required result is 0.
                 */
                int requiredMinResult = checkForAllWords ? 1 : 0;

                /**
                 * 2 words are consecutive, take them and apply findAll on them
                 */
                if (biWord) {

                    String biword1 = word1 + " " + word2;
                    String biword2 = word2 + " " + word3;
                    String biword3 = word1 + " " + word3;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;

                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */
                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            case 4: {
                Iterator<String> itr = terms.iterator();
                String word1 = itr.next();
                String word2 = itr.next();
                String word3 = itr.next();
                String word4 = itr.next();
                int requiredMinResult = 0;
                if (triWord) {

                    requiredMinResult = checkForAllWords ? 1 : 0;

                    String triword1 = word1 + " " + word2 + " " + word3;
                    String triword2 = word1 + " " + word3 + " " + word4;
                    String triword3 = word2 + " " + word3 + " " + word4;
                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { triword1, triword2, triword3 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;
                }

                if (biWord) {

                    requiredMinResult = checkForAllWords ? 2 : 0;

                    String biword1 = word1 + " " + word2;
                    String biword2 = word1 + " " + word3;
                    String biword3 = word1 + " " + word4;
                    String biword4 = word2 + " " + word3;
                    String biword5 = word2 + " " + word4;

                    highRanked = new BitSetOrSet();
                    String[] phrases = new String[] { biword1, biword2, biword3, biword4, biword5 };

                    int found = 0;
                    for (String phrase : phrases) {
                        int result = findATerm(false, isCompressed, isCached, highRanked, rowIdPrefix, phrase,
                                false);
                        if (result > 0)
                            found++;
                    }

                    if (found > requiredMinResult || checkExactPhrase)
                        return highRanked;

                }

                /*
                 * Biword and Triword search result is 0 so search for all words.
                 */

                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);

            }
            default: {
                /*
                 * Biword and Triword are not enabled, so search for all words.
                 */

                return searchForAllWord(terms, checkForAllWords, isCompressed, isCached, destination,
                        rowIdPrefix);
            }
            }
        }
    } catch (Exception e) {
        String msg = "Error while processing query [" + fieldQuery + "]\n";
        IdSearchLog.l.fatal(this.getClass().getName() + ":\t" + msg);
        e.printStackTrace();
        throw new IOException(msg, e);
    }
}

From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public void mapFreeTextSet(Field fld, Context context) throws IOException, InterruptedException {

    terms.clear();
    CharTermAttribute termAttribute = null;
    TokenStream stream = null;

    int wordhash;
    String wordhashStr;
    char firstChar;
    char lastChar;

    try {
        if (isFieldNull)
            return;

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name);
        stream = analyzer.tokenStream(fld.name, new StringReader(fldValue));
        termAttribute = stream.getAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            String termWord = termAttribute.toString();
            wordhash = Hashing.hash(termWord);
            wordhashStr = new Integer(wordhash).toString();
            firstChar = wordhashStr.charAt(0);
            lastChar = wordhashStr.charAt(wordhashStr.length() - 1);

            rowKeyP1 = mergeId + "_" + firstChar + "_" + lastChar;
            appender.delete(0, appender.capacity());
            rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                    .toString();
            appender.delete(0, appender.capacity());
            rowVal = appender.append(incrementalIdSeekPosition).append(KVIndexer.FIELD_SEPARATOR)
                    .append(wordhash).toString();

            context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:com.bizosys.hsearch.kv.indexing.KVMapperBase.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public final void mapFreeTextBitset(final Field fld, final Context context)
        throws IOException, InterruptedException {

    terms.clear();
    CharTermAttribute termAttribute = null;
    TokenStream stream = null;
    try {
        if (isFieldNull)
            return;

        Analyzer analyzer = AnalyzerFactory.getInstance().getAnalyzer(fld.name);
        stream = analyzer.tokenStream(fld.name, new StringReader(fldValue));
        termAttribute = stream.getAttribute(CharTermAttribute.class);
        String last2 = null;
        String last1 = null;
        while (stream.incrementToken()) {
            String termWord = termAttribute.toString();

            if (0 == termWord.length())
                continue;

            appender.delete(0, appender.capacity());

            /**
             * Row Key is mergeidFIELDwordhashStr
             */
            boolean isEmpty = (null == mergeId) ? true : (mergeId.length() == 0);
            String rowKeyPrefix = (isEmpty) ? fld.name : mergeId + "_" + fld.name;

            rowKeyP1 = rowKeyPrefix + termWord;
            rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                    .toString();

            appender.setLength(0);
            rowVal = appender.append(incrementalIdSeekPosition).toString();

            context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));

            if (!fld.isBiWord && !fld.isTriWord)
                continue;

            /**
             * Emit the three-word phrase row
             */
            if (null != last2) {
                appender.setLength(0);

                rowKeyP1 = appender.append(rowKeyPrefix).append(last2).append(' ').append(last1).append(' ')
                        .append(termWord).append(' ').append('*').toString();

                appender.setLength(0);
                rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                        .toString();

                context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
            }

            /**
             * Emit the two-word phrase row
             */
            if (null != last1) {

                appender.setLength(0);
                rowKeyP1 = appender.append(rowKeyPrefix).append(last1).append(' ').append(termWord).append(' ')
                        .append('*').toString();

                appender.setLength(0);
                rowKeyP2 = appender.append("text").append(KVIndexer.FIELD_SEPARATOR).append(fld.sourceSeq)
                        .toString();

                context.write(new TextPair(rowKeyP1, rowKeyP2), new Text(rowVal));
            }

            last2 = last1;
            last1 = termWord;

        }
    } catch (Exception e) {
        e.printStackTrace();
        System.err.println("Error While tokenizing : " + e.getMessage());
    } finally {
        try {
            if (null != stream)
                stream.close();
        } catch (Exception ex) {
            IdSearchLog.l.warn("Error during Tokenizer Stream closure");
        }
    }
}

From source file:com.bizosys.unstructured.CustomAnalyzerExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "Abinash", Field.Store.NO, Field.Index.ANALYZED));
    Analyzer analyzer = new CustomAnalyzerExample();

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println(termA.toString());
        }
        sr.close();
    }
}

From source file:com.bizosys.unstructured.IndexWriter.java

License:Apache License

/**
 * Find the last offset.
 * Find each term offset
 * 
 * @param stream
 * @param docId
 * @param docType
 * @param fieldType
 * @param fieldBoost
 * @param codecs
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);

    while (stream.incrementToken()) {

        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.delete(0, sb.capacity());

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}

From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {

    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO,
            Field.Index.ANALYZED));
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();

    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);

        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        sr.close();
    }

    analyzer.close();

}

From source file:com.bizosys.unstructured.SynonumAnalyzerExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));
    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        sr.close();
    }
}

From source file:com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java

License:Apache License

/**
 * Creates a query to find intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;

    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}
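
A hedged usage sketch for the query above: since the constructor calls super(true) and add(...), InNumericIntervalQuery appears to extend BooleanQuery, so it can be passed to an IndexSearcher like any other query. The field name, value, and precision step below are illustrative only; the precision step must match the one used when the field was indexed.

// Hypothetical caller code; "duration" and the numbers are made up for illustration.
Query query = new InNumericIntervalQuery("duration", 1200L, 4);
TopDocs hits = indexSearcher.search(query, 10); // indexSearcher is an already-open IndexSearcher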

From source file:com.chimpler.example.bayes.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}