Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usages of getAttribute from org.apache.lucene.analysis.TokenStream.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
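
Before the project snippets below, here is a minimal, self-contained sketch of the usual consumption pattern; the analyzer choice, field name, and input text are illustrative assumptions, using Lucene 4.x-era APIs. addAttribute registers the attribute if it is absent, after which getAttribute returns that same instance.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        TokenStream stream = analyzer.tokenStream("text", new StringReader("a quick example"));

        // addAttribute creates the attribute if the stream lacks it;
        // getAttribute then returns the very same instance.
        stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

        stream.reset();                      // consumer contract: reset before incrementToken
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();                        // then end and close
        stream.close();
        analyzer.close();
    }
}

Note that most tokenizer chains already register CharTermAttribute, so the addAttribute call above is a safety net rather than a requirement. Asking getAttribute for an attribute the stream does not carry fails: older Lucene versions throw IllegalArgumentException, recent ones return null.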

Usage

From source file:com.bizosys.unstructured.IndexWriter.java

License:Apache License

/**
 * Find the last offset and each term offset.
 * 
 * @param stream
 * @param docId
 * @param docType
 * @param filter
 * @param fieldType
 * @param uniqueTokens
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter, int fieldType,
        Map<String, IndexRow> uniqueTokens) throws IOException {

    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;

    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);

    stream.reset();

    while (stream.incrementToken()) {

        token = termA.toString();
        curoffset = offsetA.endOffset();

        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;

        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.setLength(0);

        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();

    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}

From source file:com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {

    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO,
            Field.Index.ANALYZED));
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();

    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset();

        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        stream.end();
        stream.close();
        sr.close();
    }

    analyzer.close();

}

From source file:com.bizosys.unstructured.SynonumAnalyzerExample.java

License:Apache License

public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));
    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);

    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        stream.end();
        stream.close();
        sr.close();
    }
}

From source file:com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java

License:Apache License

/**
 * Creates a query to find intervals a number is in.
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;

    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);

    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}

From source file:com.chimpler.example.bayes.Classifier.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}

From source file:com.chriscx.stem.Stem.java

public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }

    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est"
            + " et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique"
            + " revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes"
            + " tu un une va vers voici voilà vos votre vous vu vôtre vôtres y " + " à été être ";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }

    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);

    result = "";
    try {
        String line = input.readLine();

        line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
        line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
        line = line.replaceAll("(_|-)+", "");
        line = line.replaceAll("(\\n|\\r|\\t)+", "");
        line = line.replaceAll("(?![\\._])\\p{P}", "");
        while (line != null) {

            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end();
            stream.close();
            line = input.readLine();
        }

        input.close();
        return result;
    } catch (IOException e) {
        // unexpected: an I/O error from the underlying reader
        throw new RuntimeException(e);
    }
}

From source file:com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java

License:Apache License

/**
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    System.out.println("> ----- countWords ------");

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }
    ts.end();
    ts.close();

    System.out.println("\n<");

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
    ts.end();
    ts.close();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java

License:Apache License

/**
 * Counts words
 * 
 * @param analyzer
 * @param words
 * @param in
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    //System.out.println( "> ----- countWords ------" );

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    ts.end();
    ts.close();

    //System.out.println( "\n<" );

    /*overallCounts.addAll(words);*/
}

From source file:com.cloudera.knittingboar.utils.DatasetConverter.java

License:Apache License

private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);

    // for each word in the stream, minus non-word stuff, add word to collection
    ts.reset();
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }
    ts.end();
    ts.close();

}