Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
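
For orientation, here is a minimal consumer sketch of that workflow: add attributes, call reset(), loop on incrementToken(), then call end() and close(). It is an illustrative example only, assuming a recent Lucene version in which WhitespaceAnalyzer has a no-argument constructor and Analyzer.tokenStream(String, String) is available; the class name IncrementTokenSketch is a placeholder.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("field", "the quick brown fox");
            // attributes must be added before the stream is consumed
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // advances the stream to the next token
                System.out.println(term.toString());
            }
            stream.end();                     // records end-of-stream state (e.g. final offsets)
            stream.close();                   // releases resources held by the stream
        }
    }
}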

Usage

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream tokenStream = analyzer.tokenStream("contents", // analyze the text as field "contents"
            new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); // start/end offsets
    TypeAttribute type = tokenStream.addAttribute(TypeAttribute.class); // token type

    tokenStream.reset(); // required before the first incrementToken()
    while (tokenStream.incrementToken()) { // advance to the next token

        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);
        System.out.println(type.type());
    }
    tokenStream.end();
    tokenStream.close();
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));

    TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    stream.reset(); // required before the first incrementToken()
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.getBytesRef().utf8ToString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", text);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // required before the first incrementToken()
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.end();
    stream.close();
}

From source file:lia.chapter4.SimpleAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {

    TokenStream tokenStream = new SimpleAnalyzer().tokenStream("text", "The quick brown fox..");
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);

    }
    tokenStream.end();
    tokenStream.close();

    /*AnalyzerUtils.displayTokensWithFullDetails(new SimpleAnalyzer(),
      "The quick brown fox....");*/
}

From source file:lia.recent.CopyOfSearch.java

License:Apache License

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {

    System.out.println("Hello");
    KoreanAnalyzer ka = new KoreanAnalyzer();
    TokenStream ts = ka.tokenStream("", new java.io.StringReader("? ? I an Hello"));
    System.out.println(ts.toString());
    try {
        org.apache.lucene.analysis.tokenattributes.TermAttribute ta = ts
                .addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println("adf" + ta.term());
        }
    } catch (Exception e) {
        System.out.println(e.toString());
    }
}

From source file:lucandra.IndexWriter.java

License:Apache License

@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition

        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());

            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();

            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms

            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store field norms per term per document rather than per field;
            // this costs more on write but less on read
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination.
                // This is required since Cassandra loads all columns
                // in a key/column family into memory.
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // the last byte flags whether the value is binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);

        }
    }

    // Finally, store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;

    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}

From source file:lucli.LuceneMethods.java

License:Apache License

private void invertDocument(Document doc) throws IOException {

    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null)
                    reader = field.readerValue();
                else if (field.stringValue() != null)
                    reader = new StringReader(field.stringValue());
                else
                    throw new IllegalArgumentException("field must have either String or Reader value");

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);

                try {
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer Count = (Integer) tokenMap.get(name);
                        if (Count == null) { // not in there yet
                            tokenMap.put(name, new Integer(1)); //first one
                        } else {
                            int count = Count.intValue();
                            tokenMap.put(name, new Integer(count + 1));
                        }
                        if (position > maxFieldLength)
                            break;
                    }
                } finally {
                    stream.close();
                }
            }

        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}

From source file:mahout.classifier.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = termAtt.toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close(); // the stream must be closed before the analyzer builds a new one for the next tweet

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most
        // likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    // keep the attribute returned by addAttribute instead of re-fetching it on every token
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.print(charTermAttribute.toString() + " ,");
    }
    tokenStream.end();
    tokenStream.close();

    analyzer.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        // keep the attribute returned by addAttribute instead of re-fetching it on every token
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();

        return words;
    } finally {
        analyzer.close();
    }
}