List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. getAttribute returns the stream's existing instance of that attribute type; if the attribute may not have been registered yet, use addAttribute(Class) instead, which creates and registers the instance on demand.
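Before the project examples below, here is a minimal self-contained sketch of the usual consume loop. It assumes a recent Lucene where StandardAnalyzer no longer takes a Version argument; the field name "body" and the sample text are illustrative only.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("body", "Hello token streams");
        // addAttribute registers the attribute if the stream lacks it;
        // getAttribute then returns that same registered instance.
        stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
        stream.reset(); // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
        }
        stream.end();   // records end-of-stream state (final offset, etc.)
        stream.close();
        analyzer.close();
    }
}

Prefer addAttribute when the attribute might not be present yet; getAttribute suits code paths, like the examples below, where the tokenizer is known to have registered the attribute already.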
From source file: com.bizosys.unstructured.IndexWriter.java
License: Apache License
/**
 * Finds each term's offset and position and stages an IndexRow per unique token.
 *
 * @param stream       token stream to consume
 * @param docId        document identifier
 * @param docType      document type code
 * @param filter       optional document metadata attached to each row
 * @param fieldType    field type code
 * @param uniqueTokens accumulator of rows keyed by generated row key
 * @throws IOException
 */
private final void tokenize(TokenStream stream, int docId, int docType, DocumentMetadata filter,
        int fieldType, Map<String, IndexRow> uniqueTokens) throws IOException {
    String token = null;
    int curoffset = 0;
    int lastoffset = 0;
    int position = -1;
    StringBuilder sb = new StringBuilder();
    CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = stream.getAttribute(OffsetAttribute.class);
    stream.reset(); // reset the stream before the first incrementToken()
    while (stream.incrementToken()) {
        token = termA.toString();
        curoffset = offsetA.endOffset();
        if (lastoffset != curoffset)
            position++;
        lastoffset = curoffset;
        String key = IndexRow.generateKey(sb, docId, token, docType, fieldType, filter);
        sb.setLength(0); // reuse the builder for the next key
        if (uniqueTokens.containsKey(key)) {
            IndexRow existingRow = uniqueTokens.get(key);
            existingRow.set(curoffset, position);
            existingRow.occurance++;
        } else {
            IndexRow row = new IndexRow(docId, token, docType, fieldType, curoffset, position);
            if (null != filter)
                row.docMeta = filter;
            uniqueTokens.put(key, row);
        }
    }
    stream.end();
    stream.close();
    for (IndexRow row : uniqueTokens.values())
        cachedIndex.add(row);
}
From source file: com.bizosys.unstructured.StopwordAndSynonymAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    Document doc = new Document();
    doc.add(new Field("description", "dress/t-shirt dress for \"good boy\"", Field.Store.NO, Field.Index.ANALYZED));
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    for (Fieldable field : doc.getFields()) {
        String query = "dress/t-shirt dress for \"good boy\"";
        StringReader sr = new StringReader(query);
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset(); // reset the stream before the first incrementToken()
        if (DEBUG_ENABLED) {
            while (stream.incrementToken()) {
                IdSearchLog.l.debug("Term:" + termA.toString());
            }
        }
        stream.end();
        stream.close();
        sr.close();
    }
    analyzer.close();
}
From source file: com.bizosys.unstructured.SynonumAnalyzerExample.java
License: Apache License
public static void main(String[] args) throws Exception {
    Document doc = new Document();
    doc.add(new Field("description", "bengalure is a good city", Field.Store.NO, Field.Index.ANALYZED));
    Map<String, String> syn = new HashMap<String, String>();
    syn.put("bangalore", "bengalure|bangaluru");
    Analyzer analyzer = new StopwordAndSynonymAnalyzer();
    //analyzer.load(null, syn);
    for (Fieldable field : doc.getFields()) {
        StringReader sr = new StringReader(field.stringValue());
        TokenStream stream = analyzer.tokenStream(field.name(), sr);
        CharTermAttribute termA = stream.getAttribute(CharTermAttribute.class);
        stream.reset(); // reset the stream before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println("Term:" + termA.toString());
        }
        stream.end();
        stream.close();
        sr.close();
    }
}
From source file: com.bull.aurocontrol.csst.poc.index.interval.InNumericIntervalQuery.java
License: Apache License
/**
 * Creates a query to find intervals a number is in.
 *
 * @param name The name of the field to search.
 * @param value The search value.
 * @param precisionStep The precision step used when indexing the field.
 */
public InNumericIntervalQuery(final String name, final long value, final int precisionStep) {
    super(true);
    this.value = value;
    TokenStream stream = new NumericTokenStream(precisionStep).setLongValue(value);
    try {
        stream.reset();
        while (stream.incrementToken()) {
            this.add(new TermQuery(new Term(name, stream.getAttribute(TermAttribute.class).term())),
                    BooleanClause.Occur.SHOULD);
        }
    } catch (IOException e) {
        throw new IllegalStateException("This should never happen - NumericTokenStream does no IO.");
    }
}
From source file: com.chimpler.example.bayes.Classifier.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];
    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close();

        // create vector wordId => weight using tf-idf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file: com.chriscx.stem.Stem.java
public String evaluate(BufferedReader input) {
    if (input == null) {
        return null;
    }
    CharArraySet stopWordsSet = new CharArraySet(Version.LUCENE_46, 10000, true);
    String stopWords = "a afin ai ainsi après attendu au aujourd auquel aussi "
            + "autre autres aux auxquelles auxquels avait avant avec c car ce "
            + "ceci cela celle celles celui cependant certain certaine certaines "
            + "certains ces cet cette ceux chez ci combien comme comment "
            + "concernant contre d dans de debout dedans dehors delà depuis "
            + "derrière des désormais desquelles desquels devers devra doit "
            + "donc dont du duquel durant dès elle elles en entre environ est "
            + "et etc eu eux excepté hormis hors hélas hui il ils j je jusqu "
            + "jusque l la laquelle le lequel les lesquelles lesquels leur leurs "
            + "lorsque lui là ma mais malgré me merci mes mien mienne miennes "
            + "miens moins mon moyennant même mêmes n ne ni non nos notre nous "
            + "néanmoins nôtre nôtres on ont ou outre où par parmi partant pas "
            + "passé pendant plein plus plusieurs pour pourquoi proche près "
            + "puisque qu quand que quel quelle quelles quels qui quoi quoique "
            + "revoici revoilà s sa sans sauf se selon seront ses si sien "
            + "sienne siennes siens sinon soi soit son sont sous suivant sur "
            + "ta te tes tien tienne tiennes tiens ton tous tout toute toutes "
            + "tu un une va vers voici voilà vos votre vous vu vôtre vôtres y "
            + "à été être";
    String[] stopWordsTab = stopWords.split(" ");
    for (String word : stopWordsTab) {
        stopWordsSet.add(word);
    }
    Analyzer analyzer = new FrenchAnalyzer(Version.LUCENE_46, stopWordsSet);
    result = "";
    try {
        String line = input.readLine();
        while (line != null) {
            // scrub e-mail addresses, phone numbers/digits, separators and punctuation
            line = line.replaceAll("(\\S)+@(\\S)+.(\\S)+", "");
            line = line.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
            line = line.replaceAll("(_|-)+", "");
            line = line.replaceAll("(\\n|\\r|\\t)+", "");
            line = line.replaceAll("(?![\\._])\\p{P}", "");
            TokenStream stream = analyzer.tokenStream(null, line);
            stream.reset();
            while (stream.incrementToken()) {
                String wordset = stream.getAttribute(CharTermAttribute.class).toString();
                wordset = wordset.replaceAll("(0[0-68]([-. ]?\\d{2}){4}[-. ]?)|\\d+", "");
                result += wordset + " ";
            }
            result += "\n";
            stream.end();
            stream.close();
            line = input.readLine();
        }
        input.close();
        return result;
    } catch (IOException e) {
        // wrap the checked exception from the reader
        throw new RuntimeException(e);
    }
}
From source file: com.cloudera.knittingboar.records.Test20NewsgroupsBookParsing.java
License: Apache License
/**
 * Counts words.
 *
 * @param analyzer the analyzer used to tokenize the input
 * @param words    the collection the tokens are added to
 * @param in       the input to tokenize
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    System.out.println("> ----- countWords ------");
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        System.out.print(" " + s);
        words.add(s);
    }
    System.out.println("\n<");
    /*overallCounts.addAll(words);*/
}
From source file: com.cloudera.knittingboar.records.TwentyNewsgroupsRecordFactory.java
License: Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        words.add(s);
    }
}
From source file: com.cloudera.knittingboar.sgd.olr.TestBaseOLR_Train20Newsgroups.java
License: Apache License
/**
 * Counts words.
 *
 * @param analyzer the analyzer used to tokenize the input
 * @param words    the collection the tokens are added to
 * @param in       the input to tokenize
 * @throws IOException
 */
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    //System.out.println( "> ----- countWords ------" );
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        //System.out.print( " " + s );
        words.add(s);
    }
    //System.out.println( "\n<" );
    /*overallCounts.addAll(words);*/
}
From source file: com.cloudera.knittingboar.utils.DatasetConverter.java
License: Apache License
private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // reset the stream before the first incrementToken()
    // for each word in the stream, minus non-word stuff, add word to collection
    while (ts.incrementToken()) {
        String s = ts.getAttribute(CharTermAttribute.class).toString();
        // System.out.print( " " + s );
        words.add(s);
    }
}