Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage of org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
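
For orientation, here is a minimal consumer sketch of that workflow: add attributes, call reset(), loop on incrementToken(), then call end() and close(). It is an illustrative example only, assuming a recent Lucene version in which WhitespaceAnalyzer has a no-argument constructor and Analyzer.tokenStream(String, String) is available; the class name IncrementTokenSketch is a placeholder.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenSketch {

    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("field", "the quick brown fox");
            // attributes must be added before the stream is consumed
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken()
            while (stream.incrementToken()) { // advances the stream to the next token
                System.out.println(term.toString());
            }
            stream.end();                     // records end-of-stream state (e.g. final offsets)
            stream.close();                   // releases resources held by the stream
        }
    }
}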

Usage

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream tokenStream = analyzer.tokenStream("contents", // analyze the text as field "contents"
            new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); // start/end offsets
    TypeAttribute type = tokenStream.addAttribute(TypeAttribute.class); // token type

    tokenStream.reset(); // required before the first incrementToken()
    while (tokenStream.incrementToken()) { // advance to the next token

        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);
        System.out.println(type.type());
    }
    tokenStream.end();
    tokenStream.close();
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));

    TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    stream.reset(); // required before the first incrementToken()
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.getBytesRef().utf8ToString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}

From source file:lia.chapter4.AnalyzerUtils.java

License:Apache License

public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", text);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // required before the first incrementToken()
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.end();
    stream.close();
}

From source file:lia.chapter4.SimpleAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {

    TokenStream tokenStream = new SimpleAnalyzer().tokenStream("text", "The quick brown fox..");
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);

    }
    tokenStream.end();
    tokenStream.close();

    /*AnalyzerUtils.displayTokensWithFullDetails(new SimpleAnalyzer(),
      "The quick brown fox....");*/
}

From source file:lia.recent.CopyOfSearch.java

License:Apache License

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {

    System.out.println("Hello");
    KoreanAnalyzer ka = new KoreanAnalyzer();
    TokenStream ts = ka.tokenStream("", new java.io.StringReader("? ? I an Hello"));
    System.out.println(ts.toString());
    try {
        org.apache.lucene.analysis.tokenattributes.TermAttribute ta = ts
                .addAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println("adf" + ta.term());
        }
    } catch (Exception e) {
        System.out.println(e.toString());
    }
}

From source file:lucandra.IndexWriter.java

License:Apache License

@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition

        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());

            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();

            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms

            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store field norms per term per document rather than per field;
            // this costs more on write but less on read
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination.
                // This is required since Cassandra loads all columns
                // in a key/column family into memory.
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // the last byte flags whether the value is binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);

        }
    }

    // Finally, store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;

    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}

From source file:lucli.LuceneMethods.java

License:Apache License

private void invertDocument(Document doc) throws IOException {

    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null)
                    reader = field.readerValue();
                else if (field.stringValue() != null)
                    reader = new StringReader(field.stringValue());
                else
                    throw new IllegalArgumentException("field must have either String or Reader value");

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);

                try {
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer Count = (Integer) tokenMap.get(name);
                        if (Count == null) { // not in there yet
                            tokenMap.put(name, new Integer(1)); //first one
                        } else {
                            int count = Count.intValue();
                            tokenMap.put(name, new Integer(count + 1));
                        }
                        if (position > maxFieldLength)
                            break;
                    }
                } finally {
                    stream.close();
                }
            }

        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}

From source file:mahout.classifier.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = termAtt.toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close(); // the stream must be closed before the analyzer builds a new one for the next tweet

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most
        // likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    // keep the attribute returned by addAttribute instead of re-fetching it on every token
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        System.out.print(charTermAttribute.toString() + " ,");
    }
    tokenStream.end();
    tokenStream.close();

    analyzer.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        // keep the attribute returned by addAttribute instead of re-fetching it on every token
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(charTermAttribute.toString());
        }
        tokenStream.end();
        tokenStream.close();

        return words;
    } finally {
        analyzer.close();
    }
}