Example usage for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass)

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class<?

Usage

From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java

License:Apache License

public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();

    try {/*  w  ww  . j  a  v  a2 s. co  m*/
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:edu.stanford.rad.naivebayes.ClassifyLines.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      if (args.length < 5) {
    //         System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //         return;
    //      }/*w w w  .j  av a 2s  . co  m*/
    //      String modelPath = args[0];
    //      String labelIndexPath = args[1];
    //      String dictionaryPath = args[2];
    //      String documentFrequencyPath = args[3];
    //      String tweetsPath = args[4];

    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}

From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java

License:Apache License

@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {/*from w ww . j  a  v  a 2  s . co m*/
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}

From source file:edu.virginia.cs.utility.StringTokenizer.java

/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string/*  w  w w  .j  a va 2s . c om*/
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:elhuyar.bilakit.PayloadQParserPlugin.java

License:Open Source License

@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        //analyze queryText
        List<String> result = new ArrayList<String>();
        try {/*from  w  ww.  jav a 2 s.co  m*/
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            //    not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;
        // Note that this will work for any field defined with the
        //    <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);

        return plter;

    }
    return super.getFieldQuery(field, queryText, quoted);
}

From source file:engine.easy.analyzer.EasySearchAnalyzer.java

License:Apache License

private static void printResult(String text, Analyzer analyzer) throws IOException {

    int tokenCount = 0;
    TokenStream tokenStream = analyzer.tokenStream("FIELDNAME", new StringReader(text)); // this method will used for token streams
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        tokenCount++;//from  ww  w  . j  a  v a  2  s  .com
        String tokenText = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        System.out.println(" >> Token " + tokenCount + ": " + tokenText);
    }
}

From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java

License:Apache License

/**
 * Count the token stream tokens./*  www . j a  v  a  2  s  .  co m*/
 * 
 * @return it returns the no:of stream tokens.
  * @throws IOException if the file would have any IO operation.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);

    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }

    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}

From source file:filters.ComparisonDegreeFilter.java

License:Open Source License

/**
 * Constructor for class NegationScopeFilter
 * /*from   w w w .  j  av a 2  s  .c  om*/
 * @param input
 */
public ComparisonDegreeFilter(TokenStream input) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream
    output_term = this.getAttribute(TermAttribute.class);
    output_type = this.getAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = this.getAttribute(PayloadAttribute.class);

    // Default degree
    degree = ComparisonDegree.NONE;
}

From source file:filters.dependencies.NegationWordFilter.java

License:Open Source License

/**
 * Constructor for class NegationScopeFilter
 * @param input/*from  w  w w.  ja v a  2s  .c  om*/
 */
public NegationWordFilter(TokenStream input) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream
    output_term = this.getAttribute(TermAttribute.class);
    output_type = this.getAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = this.getAttribute(PayloadAttribute.class);

}

From source file:filters.indexing.IndexableFilter.java

License:Open Source License

/**
 * Constructor for class IndexableFilter
 * @param input/*from  w w  w .j  a  va2s. c o m*/
 */
public IndexableFilter(TokenStream input, boolean set_synset_terms) {
    super(input);

    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);

    // Setting attributes for this token stream
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = input.addAttribute(PayloadAttribute.class);

    this.set_synset_terms = set_synset_terms;
}