List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
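Every example below follows the same consumption contract: register the attributes you need with addAttribute (which returns the stream's single shared instance for that attribute class), call reset() before the first incrementToken(), read the attribute values inside the loop, then finish with end() and close(). A minimal sketch of the pattern, assuming Lucene 5+ where StandardAnalyzer has a no-argument constructor; the class name, field name, and sample text are placeholders:

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // TokenStream is Closeable, so try-with-resources handles close()
        try (TokenStream stream = analyzer.tokenStream("contents", new StringReader("some sample text"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // shared instance, updated in place
            stream.reset();                    // mandatory before the first incrementToken()
            while (stream.incrementToken()) {  // advances the stream; the attribute now holds the next token
                System.out.println(term.toString());
            }
            stream.end();                      // end-of-stream bookkeeping, e.g. the final offset
        }
        analyzer.close();
    }
}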
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.print("[" + term + "] "); // print each token's text
    }
    stream.end();
    stream.close();
}
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            // a positive increment starts a new position; print it on a new line
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        System.out.print("[" + term + "] ");
    }
    stream.end();
    stream.close();
    System.out.println();
}
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    // register all attributes of interest before consuming the stream
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    stream.reset();
    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            // a positive increment starts a new position; print it on a new line
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }
        // print term text, character offsets, and token type
        System.out.print("[" + term + ":" + offset.startOffset() + "->" + offset.endOffset() + ":" + type.type() + "] ");
    }
    stream.end();
    stream.close();
    System.out.println();
}
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    // the stream must be exhausted once all expected tokens are consumed
    Assert.assertFalse(stream.incrementToken());
    stream.end();
    stream.close();
}
From source file:com.sxc.lucene.analysis.AnalyzerUtils.java
License:Apache License
public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.end();
    stream.close();
}
From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java
License:Apache License
public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", new StringReader("jumps"));
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset();
    int i = 0;
    String[] expected = new String[] { "jumps", "hops", "leaps" };
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());
        int expectedPos;
        if (i == 0) {
            expectedPos = 1; // the original token advances the position
        } else {
            expectedPos = 0; // injected synonyms are stacked at the same position
        }
        assertEquals(expectedPos, posIncr.getPositionIncrement());
        i++;
    }
    stream.end();
    stream.close();
    assertEquals(3, i);
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println("Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println("Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];
    Configuration configuration = new Configuration();
    // the model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
    // labels is a map labelId => label
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));
    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    int labelCount = labels.size();
    // the -1 entry of the document-frequency map holds the total document count
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();
    // extract words from the input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = termAtt.toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    ts.end();
    ts.close();
    // create a vector wordId => weight using TF-IDF
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // the classifier returns one score per label; the label with the
    // highest score is the most likely class for the document
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);
    analyzer.close();
}
From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java
License:Apache License
public static void main(String[] args) throws IOException {
    // Korean sample text to tokenize (the original string was lost to mis-encoding)
    String text = " ??. . DB ? ? ?? , ? ? , , ?, ? ... ? ? ? ? ?.";
    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    // get the term and offset attributes from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    // print all tokens until the stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":" + offsetAtt.endOffset() + ")");
    }
    stream.end();
    stream.close();
}
From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // IKAnalyzer: the boolean selects smart segmentation (true) or fine-grained segmentation (false)
    Analyzer analyzer = new IKAnalyzer(false);
    // obtain Lucene's TokenStream
    TokenStream ts = null;
    try {
        // Chinese sample sentence (the original string was lost to mis-encoding)
        ts = analyzer.tokenStream("myfield", new StringReader("??????"));
        // offset attribute: start/end character position of each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // term attribute: the token text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // type attribute: the lexical type of the token
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // reset the TokenStream (and its underlying StringReader)
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            // skip single-character terms
            if (term.toString().length() == 1) {
                continue;
            }
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString() + " | " + type.type());
        }
        // perform end-of-stream operations, e.g. set the final offset
        ts.end();
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        // release all resources held by the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:com.yourcompany.hadoop.mapreduce.lexical.LexicalAnalyzerMapper.java
License:Apache License
protected static List<String> collectExtractedNouns(TokenStream stream) throws IOException {
    CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> extractedTokens = Lists.newArrayList();
    while (stream.incrementToken()) {
        extractedTokens.add(charTermAtt.toString());
    }
    return extractedTokens;
}