Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
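
Below is a minimal sketch of the common call pattern (the field name "field", the sample method name, and the surrounding analyzer setup are illustrative assumptions, not part of the documented API). getAttribute only retrieves an attribute that the tokenizer chain already provides; use addAttribute if the attribute should be created on demand.

public static void printTokens(Analyzer analyzer, String text) throws IOException {
    // tokenStream() supplies a (possibly reused) stream; it must be reset before the first incrementToken()
    try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
        // Standard tokenizers provide CharTermAttribute, so getAttribute can fetch the shared instance here
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end(); // record end-of-stream state before the stream is closed
    }
}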

Usage

From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java

License:Apache License

public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}

From source file:dependencies.ReviewDependencyAnalyzer.java

License:Open Source License

public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Send reader data through the analyzer
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the tokenstream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();

        while (tokstr.incrementToken()) {
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list.
        // This case could occur when the last sentence of a given passage does not end with a
        // period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }

        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file:di.uniba.it.wsd.RevisedLesk.java

License:Open Source License

/**
 *
 * @param text
 * @return
 * @throws IOException
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}",
                    language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    while (tokenStream.incrementToken()) {
        TermAttribute token = tokenStream.getAttribute(TermAttribute.class); // generic return type, explicit cast not needed
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}

From source file:Document.DocumentProcessor.java

public final void processDocument(Document doc) {
    try {
        CharArraySet ch = new CharArraySet(Version.LUCENE_48, stopWords, true);
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(doc.getContent()));
        tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, ch); // use the same Version constant as the tokenizer above
        tokenStream = new PorterStemFilter(tokenStream);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        Set<String> uniqueWords = new HashSet<>();
        Map<String, Integer> wordFrequency = new HashMap<String, Integer>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = charTermAttribute.toString();
            uniqueWords.add(word);
            if (wordFrequency.containsKey(word))
                wordFrequency.put(word, wordFrequency.get(word) + 1);
            else
                wordFrequency.put(word, 1);
            dictionary.add(word);

        }
        doc.setUniqueWords(uniqueWords);
        doc.setWordFrequency(wordFrequency);

    } catch (IOException ex) {
        Logger.getLogger(DocumentProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenize(String data) {

    Set<String> transformedSet = new HashSet<String>(); //Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) { //ignore stopwords
                //System.out.println("Contains stopword: "+ term);
                continue;
            }
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1) //ignore 1 letter words
                continue;

            if (!digitPattern.matcher(term).find()) { //ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("transormed set size in tokenizeAndStem: "+ transformedSet.size());
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessRemoveStopWords(String data) {

    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1)
                continue;
            if (stopwords.contains(term))
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString().trim();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    //System.out.println("Stop words length:" + stopwords.size());
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(" ")
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;

    return StringUtils.join(transformedSet.toArray(), " ");
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(transformedSet.toArray(new String[transformedSet.size()]).toString());
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");

}

From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java

License:Apache License

@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);

        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produces singles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}