Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.

Usage
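
All of the examples below follow the same basic pattern: obtain a TokenStream from an Analyzer, call reset(), iterate with incrementToken(), and read each term through getAttribute. Here is a minimal, self-contained sketch of that pattern (the field name, input text, and class name are illustrative placeholders, and a recent Lucene with a no-argument StandardAnalyzer constructor is assumed). Note that getAttribute throws an IllegalArgumentException if the requested attribute is not present in the stream, whereas addAttribute creates it on demand.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("body", "Hello token stream world")) {
            stream.reset(); // required before the first incrementToken()
            while (stream.incrementToken()) {
                // the attribute instance is shared and refilled for every token
                CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
                System.out.println(term.toString());
            }
            stream.end(); // signal the end of the stream before closing
        }
    }
}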

From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java

License:Apache License

private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {
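    // expectedStream format: "term,startOffset,endOffset,posInc/term,...";
    // attributes after the term are optional and checked only when present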

    final String[] expectedTokens = expectedStream.split("/");
    int count = 0;
    for (final String expectedToken : expectedTokens) {
        final String[] attrs = expectedToken.split(",");
        assertTrue(stream.incrementToken());

        final String term = attrs[0];
        assertAttribute(count, "term", term, stream.getAttribute(CharTermAttribute.class).toString());

        if (attrs.length > 1) {
            final int so = Integer.parseInt(attrs[1]);
            assertAttribute(count, "startOffset", so, stream.getAttribute(OffsetAttribute.class).startOffset());

            if (attrs.length > 2) {
                final int eo = Integer.parseInt(attrs[2]);
                assertAttribute(count, "endOffset", eo, stream.getAttribute(OffsetAttribute.class).endOffset());

                if (attrs.length > 3) {
                    final int pi = Integer.parseInt(attrs[3]);
                    assertAttribute(count, "posInc", pi,
                            stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                }
            }
        }
        count++;
    }
    assertFalse(stream.incrementToken());
}

From source file:lia.recent.CopyOfSearch.java

License:Apache License

/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    System.out.println("Hello");
    KoreanAnalyzer ka = new KoreanAnalyzer();
    TokenStream ts = ka.tokenStream("", new java.io.StringReader("? ? I an Hello"));
    System.out.println(ts.toString());
    try {
        ts.reset(); // prepare the stream before consuming it
        while (ts.incrementToken()) {
            org.apache.lucene.analysis.tokenattributes.TermAttribute ta =
                    ts.getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
            System.out.println("adf" + ta.term());
        }
    } catch (Exception e) {
        System.out.println(e.toString());
    }
}

From source file:lux.index.analysis.QNameTokenFilter.java

License:Mozilla Public License

public final void reset(TokenStream inputAgain) {
    assert (input.getAttribute(CharTermAttribute.class) == inputAgain.getAttribute(CharTermAttribute.class));
}

From source file:lux.index.analysis.XmlTokenStreamBase.java

License:Mozilla Public License

public void reset(Reader reader) throws IOException {
    close();
    TokenStream reset = analyzer.tokenStream(fieldName, reader);
    // This must be the same token stream: ie the Analyzer must be re-usable, and the 
    // original token stream must have arisen from it.  We don't check for actual
    // identity with wrapped since that might get wrapped again (eg w/QNameTokenFilter).
    assert (reset.getAttribute(CharTermAttribute.class) == wrapped.getAttribute(CharTermAttribute.class));
}

From source file:mahout.classifier.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;/*from www .  j av  a 2 s .c o m*/
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
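    // by convention, the key -1 in Mahout's document frequency map holds the total document count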
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end(); // release the token stream once all tokens are consumed
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }

    analyzer.close();
}

From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java

License:Apache License

public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(tokenStream.getAttribute(CharTermAttribute.class).toString());
        }

        return words;
    } finally {
        analyzer.close();
    }
}

From source file:me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java

License:Apache License

public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = tokenStream.getAttribute(CharTermAttribute.class).toString();

            if (word.length() <= 1) {
                continue;
            }

            words.add(word);
        }

        return words;
    } finally {
        analyzer.close();
    }
}

From source file:mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java

License:Apache License

@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }

    return set;
}

From source file:net.mad.ads.server.utils.http.KeywordUtils.java

License:Open Source License

public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);

        TokenStream ts = a.tokenStream("", new StringReader(queryString));

        List<String> tokens = new ArrayList<String>();

        // getAttribute (rather than addAttribute) works here because the
        // GermanAnalyzer stream already exposes a CharTermAttribute
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();

        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}