Usage examples for org.apache.lucene.analysis.TokenStream#getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value; the method returns the stream's registered instance of that attribute type.
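Before the collected examples, here is a minimal, self-contained sketch of the usual consumption pattern. It assumes a recent Lucene release (no-argument StandardAnalyzer constructor and Analyzer.tokenStream(String, String)); the class name, field name, and sample text are arbitrary: register the attribute with addAttribute, reset the stream, then read the attribute while iterating with incrementToken.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("body", "quick brown fox")) {
            // Register the attribute before consuming the stream.
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // getAttribute returns the same per-stream instance that
                // addAttribute registered above.
                System.out.println(ts.getAttribute(CharTermAttribute.class) + " == " + term);
            }
            ts.end();
        }
        analyzer.close();
    }
}

Because getAttribute returns the same per-stream instance on every call, several of the examples below compare attribute instances with ==.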
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizerTest.java
License:Apache License
private void assertTokenStream(final TokenStream stream, final String expectedStream) throws Exception {
    final String[] expectedTokens = expectedStream.split("/");
    int count = 0;
    for (final String expectedToken : expectedTokens) {
        final String[] attrs = expectedToken.split(",");
        assertTrue(stream.incrementToken());
        final String term = attrs[0];
        assertAttribute(count, "term", term, stream.getAttribute(CharTermAttribute.class).toString());
        if (attrs.length > 1) {
            final int so = Integer.parseInt(attrs[1]);
            assertAttribute(count, "startOffset", so, stream.getAttribute(OffsetAttribute.class).startOffset());
            if (attrs.length > 2) {
                final int eo = Integer.parseInt(attrs[2]);
                assertAttribute(count, "endOffset", eo, stream.getAttribute(OffsetAttribute.class).endOffset());
                if (attrs.length > 3) {
                    final int pi = Integer.parseInt(attrs[3]);
                    assertAttribute(count, "posInc", pi,
                            stream.getAttribute(PositionIncrementAttribute.class).getPositionIncrement());
                }
            }
        }
        count++;
    }
    assertFalse(stream.incrementToken());
}
From source file:lia.recent.CopyOfSearch.java
License:Apache License
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    System.out.println("Hello");
    KoreanAnalyzer ka = new KoreanAnalyzer();
    TokenStream ts = ka.tokenStream("", new java.io.StringReader("? ? I an Hello"));
    System.out.println(ts.toString());
    try {
        while (ts.incrementToken()) {
            org.apache.lucene.analysis.tokenattributes.TermAttribute ta = ts
                    .getAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute.class);
            System.out.println("adf" + ta.term());
        }
    } catch (Exception e) {
        System.out.println(e.toString());
    }
}
From source file:lux.index.analysis.QNameTokenFilter.java
License:Mozilla Public License
public final void reset(TokenStream inputAgain) {
    assert (input.getAttribute(CharTermAttribute.class) == inputAgain.getAttribute(CharTermAttribute.class));
}
From source file:lux.index.analysis.XmlTokenStreamBase.java
License:Mozilla Public License
public void reset(Reader reader) throws IOException {
    close();
    TokenStream reset = analyzer.tokenStream(fieldName, reader);
    // This must be the same token stream: ie the Analyzer must be re-usable, and the
    // original token stream must have arisen from it. We don't check for actual
    // identity with wrapped since that might get wrapped again (eg w/QNameTokenFilter).
    assert (reset.getAttribute(CharTermAttribute.class) == wrapped.getAttribute(CharTermAttribute.class));
}
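The assertions in the two reset methods above rely on Lucene's analyzer-reuse contract: a reusable Analyzer hands back the same per-thread TokenStreamComponents on each tokenStream() call, so the attribute instances survive across calls. A minimal sketch of that behavior follows, assuming the default GLOBAL_REUSE_STRATEGY and a recent Lucene; the class name, field name, and inputs are arbitrary.

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ReuseDemo {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        CharTermAttribute first;
        try (TokenStream ts = analyzer.tokenStream("f", "one two")) {
            first = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) { /* drain the stream */ }
            ts.end();
        }
        try (TokenStream ts = analyzer.tokenStream("f", "three four")) {
            CharTermAttribute second = ts.getAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) { /* drain the stream */ }
            ts.end();
            // Default reuse strategy: the same per-thread components are
            // recycled, so the attribute instance is identical.
            System.out.println(first == second); // expected: true
        }
        analyzer.close();
    }
}

This identity check is the same trick the asserts in QNameTokenFilter and XmlTokenStreamBase use to verify that a stream really came from the expected analyzer.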
From source file:mahout.classifier.Classifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most likely
        // to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();
    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }
    analyzer.close();
}
From source file:me.smoe.adar.analyzer.luence.AnalyzerToy.java
License:Apache License
public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License:Apache License
public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file:mvm.rya.indexing.accumulo.freetext.LuceneTokenizer.java
License:Apache License
@Override
public SortedSet<String> tokenize(String string) {
    SortedSet<String> set = new TreeSet<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            set.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return set;
}
From source file:net.mad.ads.server.utils.http.KeywordUtils.java
License:Open Source License
public static List<String> getTokens(String queryString) {
    try {
        GermanAnalyzer a = new GermanAnalyzer(Version.LUCENE_33);
        TokenStream ts = a.tokenStream("", new StringReader(queryString));
        List<String> tokens = new ArrayList<String>();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String token = termAtt.toString();
            tokens.add(token);
        }
        ts.end();
        ts.close();
        return tokens;
    } catch (IOException e) {
        logger.error("", e);
    }
    return null;
}