Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class&lt;? extends Attribute&gt; value identifying the attribute to retrieve.

Usage

From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java

License:Apache License

/**
 * Tokenizes {@code term} with the supplied analyzer and returns the tokens in order.
 *
 * @param analyzer analyzer used to split the text
 * @param term     text to tokenize
 * @return list of token strings; empty (possibly partial) if an I/O error occurs
 */
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();

    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        try {
            // addAttribute is safe whether or not the attribute already exists;
            // getAttribute would throw if the analyzer chain never added it.
            CharTermAttribute token = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(token.toString());
            }
            stream.end();
        } finally {
            // Always release the stream, even if incrementToken() throws.
            stream.close();
        }
    } catch (IOException e) {
        // Kept as best-effort: callers historically receive whatever was
        // tokenized so far rather than an exception.
        e.printStackTrace();
    }
    return ret;
}

From source file:edu.stanford.rad.naivebayes.ClassifyLines.java

License:Apache License

/**
 * Classifies each line of a tab-separated tweet file (id&lt;TAB&gt;text) with a
 * pre-trained Mahout naive Bayes model and prints every label's score plus
 * the best-scoring label per tweet.
 *
 * NOTE(review): all input paths are hard-coded to one developer's machine;
 * the commented-out block below shows the intended command-line usage and
 * should probably be restored.
 */
public static void main(String[] args) throws Exception {
    //      if (args.length < 5) {
    //         System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //         return;
    //      }
    //      String modelPath = args[0];
    //      String labelIndexPath = args[1];
    //      String dictionaryPath = args[2];
    //      String documentFrequencyPath = args[3];
    //      String tweetsPath = args[4];

    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    // Key -1 holds the total document count in Mahout's df-count output.
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        // Each line is: tweetId <TAB> tweet text (split on first tab only).
        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        // NOTE(review): dimension 10000 is assumed >= dictionary size — verify.
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}

From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java

License:Apache License

/**
 * Analyzes {@code val} with the query analyzer and returns the bytes of the
 * first token whose type matches the requested normalization level
 * ({@code INDEXED_TOKEN_TYPE} when strict, {@code NORMALIZED_TOKEN_TYPE}
 * otherwise), optionally followed by the delimiter bytes. Returns an empty
 * BytesRef when no token of the wanted type is produced.
 */
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAttribute = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttribute = ts.getAttribute(TypeAttribute.class);
        String wantedType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            // Skip tokens of any other normalization level.
            if (!wantedType.equals(typeAttribute.type())) {
                continue;
            }
            BytesRefBuilder builder = new BytesRefBuilder();
            builder.copyChars(termAttribute.toString());
            boolean needsDelim = !strict || appendExtraDelim;
            if (needsDelim) {
                builder.append(delimBytes, 0, delimBytes.length);
            }
            return builder.get();
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}

From source file:edu.virginia.cs.utility.StringTokenizer.java

/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string/*  w  w w  .j  a va 2s . c om*/
 * @return list of tokens generated
 */
/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string text to tokenize with this object's analyzer
 * @return list of tokens generated
 * @throws RuntimeException wrapping any IOException from the analyzer
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        try {
            // Look the attribute up once instead of on every iteration.
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(termAtt.toString());
            }
            stream.end();
        } finally {
            // Close in finally so the stream is released even when
            // incrementToken() throws (the original leaked it on error).
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:elhuyar.bilakit.PayloadQParserPlugin.java

License:Open Source License

@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        //analyze queryText
        List<String> result = new ArrayList<String>();
        try {/*from  w  ww.  jav a 2 s.co  m*/
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            //    not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;
        // Note that this will work for any field defined with the
        //    <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);

        return plter;

    }
    return super.getFieldQuery(field, queryText, quoted);
}

From source file:engine.easy.analyzer.EasySearchAnalyzer.java

License:Apache License

/**
 * Tokenizes {@code text} with {@code analyzer} and prints each token with its
 * 1-based position.
 *
 * @param text     text to analyze
 * @param analyzer analyzer producing the token stream
 * @throws IOException if the token stream fails during iteration
 */
private static void printResult(String text, Analyzer analyzer) throws IOException {

    int tokenCount = 0;
    TokenStream tokenStream = analyzer.tokenStream("FIELDNAME", new StringReader(text));
    // addAttribute is safe whether or not the attribute already exists;
    // getAttribute would throw if the chain never added a TermAttribute.
    TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
    // reset() before consuming, per the TokenStream workflow contract.
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        tokenCount++;
        String tokenText = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        System.out.println(" >> Token " + tokenCount + ": " + tokenText);
    }
    // Original leaked the stream: no end()/close() after consuming.
    tokenStream.end();
    tokenStream.close();
}

From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java

License:Apache License

/**
 * Count the token stream tokens./*  www . j a  v  a  2  s  .  co m*/
 * 
 * @return it returns the no:of stream tokens.
  * @throws IOException if the file would have any IO operation.
 */
/**
 * Count the token stream tokens.
 *
 * @param tokenStream stream to consume; assumed already positioned at its
 *                    start (no reset() is performed before iterating —
 *                    NOTE(review): confirm callers reset first)
 * @return v[0] = total number of tokens, v[1] = number of distinct tokens
 * @throws IOException if the file would have any IO operation.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int[] v = new int[2];
    // Typed set (the original used a raw HashSet, producing unchecked warnings).
    HashSet<String> countTokenStreamBuffer = new HashSet<String>();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);

    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }

    v[1] = countTokenStreamBuffer.size();
    // Reset after consuming so the caller can reuse the stream
    // (old-Lucene reset-to-start semantics — preserved from the original).
    tokenStream.reset();
    return v;
}

From source file:filters.ComparisonDegreeFilter.java

License:Open Source License

/**
 * Constructor for class NegationScopeFilter
 * /*from   w w w .  j  av a 2  s  .c  om*/
 * @param input
 */
/**
 * Constructor for class ComparisonDegreeFilter.
 *
 * Wires the input stream's attributes to this filter's output attributes and
 * initializes the comparison degree to NONE.
 *
 * @param input upstream token stream (must already carry term, type, flags
 *              and payload attributes — getAttribute throws otherwise)
 */
public ComparisonDegreeFilter(TokenStream input) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream: use addAttribute uniformly
    // (the original mixed getAttribute/addAttribute; addAttribute returns the
    // existing instance when present and never throws), matching the sibling
    // IndexableFilter.
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = this.addAttribute(PayloadAttribute.class);

    // Default degree
    degree = ComparisonDegree.NONE;
}

From source file:filters.dependencies.NegationWordFilter.java

License:Open Source License

/**
 * Constructor for class NegationScopeFilter
 * @param input/*from  w  w w.  ja v a  2s  .c  om*/
 */
/**
 * Constructor for class NegationWordFilter.
 *
 * Wires the input stream's attributes to this filter's output attributes.
 *
 * @param input upstream token stream (must already carry term, type, flags
 *              and payload attributes — getAttribute throws otherwise)
 */
public NegationWordFilter(TokenStream input) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream: use addAttribute uniformly
    // (the original mixed getAttribute/addAttribute; addAttribute returns the
    // existing instance when present and never throws), matching the sibling
    // IndexableFilter.
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = this.addAttribute(PayloadAttribute.class);

}

From source file:filters.indexing.IndexableFilter.java

License:Open Source License

/**
 * Constructor for class IndexableFilter
 * @param input/*from  w w  w .j  a  va2s. c o m*/
 */
/**
 * Constructor for class IndexableFilter.
 *
 * Wires the input stream's attributes to this filter's output attributes.
 *
 * @param input            upstream token stream (must already carry term,
 *                         type, flags and payload attributes)
 * @param set_synset_terms whether synset terms should be emitted
 */
public IndexableFilter(TokenStream input, boolean set_synset_terms) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream. The original registered the
    // payload output on `input` instead of `this`; as a TokenFilter shares
    // its AttributeSource with its input the result is the same instance,
    // but registering on `this` is consistent with the other three outputs.
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = this.addAttribute(PayloadAttribute.class);

    this.set_synset_terms = set_synset_terms;
}