Example usage for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#reset().

Prototype

public void reset() throws IOException 

Document

This method is called by a consumer before it begins consumption using incrementToken().
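
As a quick orientation before the usage examples below, here is a minimal sketch of the consumption contract around reset(): obtain a TokenStream from an Analyzer, add the attributes you need, call reset(), loop over incrementToken(), then call end() and close(). The analyzer choice, field name, and sample method are illustrative assumptions (this sketch assumes a Lucene version where StandardAnalyzer has a no-argument constructor), not part of the examples below.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public static List<String> tokenize(String text) throws IOException {
    List<String> terms = new ArrayList<String>();
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                 // the consumer must call reset() before incrementToken()
        while (ts.incrementToken()) {
            terms.add(termAtt.toString());
        }
        ts.end();                   // records the final token state (e.g. end offset)
        ts.close();                 // releases resources held by the stream
    } finally {
        analyzer.close();
    }
    return terms;
}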

Usage

From source file: lucandra.IndexWriter.java

License: Apache License

@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition

        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());

            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();

            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms

            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination
                // This is required since cassandra loads all columns
                // in a key/column family into memory
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);

        }
    }

    // Finally, Store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;

    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}

From source file: lux.search.highlight.XmlHighlighter.java

License: Mozilla Public License

private void init(TokenStream tokenStream) {
    try {
        tokenStream.reset();
        scorer.setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
        scorerTokens = scorer.init(tokenStream);
        if (scorerTokens == null) {
            // The scorer didn't consume any tokens (it does that for PhraseQuery),
            // in which case we must give it the live token stream
            scorer.init(xmlStreamTokens);
        }
        // we score the entire document as a single fragment
        scorer.startFragment(new TextFragment("", 0, 0));
    } catch (IOException e) {
        throw new LuxException(e);
    }
}

From source file: mahout.classifier.Classifier.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}

From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java

License: Apache License

public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }

    analyzer.close();
}

From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java

License: Apache License

public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }

        return words;
    } finally {
        analyzer.close();
    }
}

From source file: me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java

License: Apache License

public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();

            if (word.length() <= 1) {
                continue;
            }

            words.add(word);
        }

        return words;
    } finally {
        analyzer.close();
    }
}

From source file: mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java

License: Apache License

@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }

    return set;
}

From source file: net.mad.ads.server.utils.http.KeywordUtils.java

License: Open Source License

public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);

        TokenStream ts = a.tokenStream("", new StringReader(queryString));

        List<String> tokens = new ArrayList<String>();

        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();

        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}

From source file: net.sf.logsaw.index.internal.LuceneIndexServiceImpl.java

License: Open Source License

private void fillPhraseQuery(PhraseQuery phrase, Analyzer analyzer, String fld, String val) throws IOException {
    TokenStream ts = analyzer.tokenStream(fld, new StringReader(val));
    try {
        ts.reset();
        // Iterate over tokens and treat each token as term
        int pos = 0;
        while (ts.incrementToken()) {
            CharTermAttribute t = ts.getAttribute(CharTermAttribute.class);
            PositionIncrementAttribute p = ts.getAttribute(PositionIncrementAttribute.class);
            pos += p.getPositionIncrement();
            phrase.add(new Term(fld, t.toString()), pos - 1);
        }
        // End-of-stream clean-up
        ts.end();
    } finally {
        ts.close();
    }
}

From source file: net.sf.okapi.lib.tmdb.lucene.Seeker.java

License: Open Source License

public List<TmHit> searchFuzzy(String genericText, String codesAsString, String tmId, String locale, int max,
        int threshold, HashMap<String, String> attributes) {
    float searchThreshold = (float) threshold;
    if (threshold < 0)
        searchThreshold = 0.0f;
    if (threshold > 100)
        searchThreshold = 100.0f;

    String queryText = genericText;

    String gtextFName = TmEntry.GTEXT_PREFIX + locale;
    Locale javaLoc = new Locale(locale);

    // create basic ngram analyzer to tokenize query
    TokenStream queryTokenStream;
    if (javaLoc.getLanguage() == Locale.ENGLISH.getLanguage()) {
        queryTokenStream = defaultFuzzyAnalyzer.tokenStream(gtextFName, new StringReader(queryText));
    } else {
        queryTokenStream = new NgramAnalyzer(javaLoc, 4).tokenStream(gtextFName, new StringReader(queryText));
    }

    // Get the TermAttribute from the TokenStream
    CharTermAttribute termAtt = (CharTermAttribute) queryTokenStream.addAttribute(CharTermAttribute.class);
    TmFuzzyQuery fQuery = new TmFuzzyQuery(searchThreshold, gtextFName);

    try {
        queryTokenStream.reset();
        while (queryTokenStream.incrementToken()) {
            //Term t = new Term(keyIndexField, new String(termAtt.buffer()));
            Term t = new Term(gtextFName, termAtt.toString());
            fQuery.add(t);
        }
        queryTokenStream.end();
        queryTokenStream.close();
    } catch (IOException e) {
        throw new OkapiIOException(e.getMessage(), e);
    }

    return getFuzzyHits(fQuery, genericText, codesAsString, tmId, locale, max, searchThreshold, attributes);
}