Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class<? extends Attribute> value. The method first checks whether an instance of that attribute class is already present in this AttributeSource and returns it; otherwise a new instance is created, added to the AttributeSource, and returned.
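
A minimal consume loop, sketched below, shows the usual pattern: register the attributes you need via addAttribute before calling reset(), iterate with incrementToken(), then finish with end() and close(). The class name, field name, and sample text are placeholders, and the no-argument StandardAnalyzer constructor assumes a recent Lucene version (older versions take a Version argument).

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("field", new StringReader("a quick example"));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); // register before reset()
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end();   // record end-of-stream state, e.g. the final offset
        stream.close();
        analyzer.close();
    }
}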

Usage

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayTokens(TokenStream stream) throws IOException {
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        System.out.print("[" + term + "] "); // B
    }
    stream.close();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        System.out.print("[" + term + "] ");
    }
    stream.close();
    System.out.println();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {

    TokenStream stream = analyzer.tokenStream("contents", // obtain a token stream over the text
            new StringReader(text));
    stream.reset();

    // register the attributes of interest before consuming the stream
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    int position = 0;
    while (stream.incrementToken()) {

        // a position increment > 0 advances the token position
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
            position = position + increment;
            System.out.println();
            System.out.print(position + ": ");
        }

        // print term text, offsets, and token type
        System.out.print("[" + term + ":" +
                offset.startOffset() + "->" + offset.endOffset() + ":" +
                type.type() + "] ");
    }
    stream.close();
    System.out.println();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void assertAnalyzesTo(Analyzer analyzer, String input, String[] output) throws Exception {
    TokenStream stream = analyzer.tokenStream("field", new StringReader(input));
    stream.reset();
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    for (String expected : output) {
        Assert.assertTrue(stream.incrementToken());
        Assert.assertEquals(expected, termAttr.toString());
    }
    Assert.assertFalse(stream.incrementToken());
    stream.close();
}

From source file:com.sxc.lucene.analysis.AnalyzerUtils.java

License:Apache License

public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    stream.reset();
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
    stream.close();
}

From source file:com.sxc.lucene.analysis.synonym.SynonymAnalyzerTest.java

License:Apache License

public void testJumps() throws Exception {
    TokenStream stream = synonymAnalyzer.tokenStream("contents", // analyze the single word "jumps"
            new StringReader("jumps"));
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);

    int i = 0;
    // expect the original token followed by its two stacked synonyms
    String[] expected = new String[] { "jumps", "hops", "leaps" };
    while (stream.incrementToken()) {
        assertEquals(expected[i], term.toString());

        // the original token has a position increment of 1; its synonyms
        // are stacked at the same position with an increment of 0
        int expectedPos;
        if (i == 0) {
            expectedPos = 1;
        } else {
            expectedPos = 0;
        }
        assertEquals(expectedPos,
                posIncr.getPositionIncrement());
        i++;
    }
    stream.close();
    assertEquals(3, i);
}

From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies an input text document into a class given a model, label index, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from the input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
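    // the total document count is stored under the special key -1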
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // close the input reader
    reader.close();
    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = termAtt.toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // release the TokenStream
    ts.end();
    ts.close();
    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }
    // The classifier returns one score per label; the label with the
    // highest score is the most likely class for the document.

    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }

    }
    System.out.println(" Class Labe: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();

}

From source file:com.weclay.ksearch2.BasicKoreanAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {
    // text to tokenize (the original Korean sample strings were lost to
    // mis-encoding; any Korean text can be substituted here)
    String text = " ??.   . DB ?  ? ?? , ? ? , , ?,  ? ... ? ?  ? ? ?.";

    BasicKoreanAnalyzer analyzer = new BasicKoreanAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the term and offset attributes from the TokenStream
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt + ": " + termAtt.length() + " (" + offsetAtt.startOffset() + ":"
                + offsetAtt.endOffset() + ")");
    }

    stream.end();
    stream.close();
}

From source file:com.wonders.xlab.healthcloud.IKAnalyzerDemo.java

License:Apache License

public static void main(String[] args) {
    // IK analyzer; the constructor flag selects smart segmentation (true)
    // or fine-grained segmentation (false)
    Analyzer analyzer = new IKAnalyzer(false);

    // obtain a Lucene TokenStream
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield",
                new StringReader("??????")); // the original Chinese sample text was lost to mis-encoding
        // register the offset, term, and type attributes
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);

        // reset the TokenStream before consuming it
        ts.reset();
        // iterate over the tokens
        while (ts.incrementToken()) {
            // skip single-character tokens
            if (term.toString().length() == 1) {
                continue;
            }
            // print offsets, term text, and token type
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        }
        // perform end-of-stream operations, e.g. set the final offset
        ts.end();

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
            // close the TokenStream to release resources
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

From source file:com.yourcompany.hadoop.mapreduce.lexical.LexicalAnalyzerMapper.java

License:Apache License

protected static List<String> collectExtractedNouns(TokenStream stream) throws IOException {
    CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class);
    List<String> extractedTokens = Lists.newArrayList();
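    // note: the caller is expected to reset() the stream before this helper
    // runs, and to end()/close() it afterwards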
    while (stream.incrementToken()) {
        extractedTokens.add(charTermAtt.toString());
    }
    return extractedTokens;
}