Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
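
Below is a minimal sketch of the common call pattern (the field name "field", the sample method name, and the surrounding analyzer setup are illustrative assumptions, not part of the documented API). getAttribute only retrieves an attribute that the tokenizer chain already provides; use addAttribute if the attribute should be created on demand.

public static void printTokens(Analyzer analyzer, String text) throws IOException {
    // tokenStream() supplies a (possibly reused) stream; it must be reset before the first incrementToken()
    try (TokenStream stream = analyzer.tokenStream("field", new StringReader(text))) {
        // Standard tokenizers provide CharTermAttribute, so getAttribute can fetch the shared instance here
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString());
        }
        stream.end(); // record end-of-stream state before the stream is closed
    }
}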

Usage

From source file:de.uni_koeln.spinfo.maalr.lucene.util.TokenizerHelper.java

License:Apache License

public static String tokenizeString(Analyzer analyzer, String string) {
    // Inspired by stackoverflow:
    // http://stackoverflow.com/questions/6334692/how-to-use-a-lucene-analyzer-to-tokenize-a-string
    StringBuilder builder = new StringBuilder();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            builder.append(stream.getAttribute(CharTermAttribute.class).toString());
            builder.append(" ");
        }
        stream.close();
    } catch (IOException e) {
        // not thrown b/c we're using a string reader...
        throw new RuntimeException(e);
    }
    return builder.toString().trim();
}

From source file:dependencies.ReviewDependencyAnalyzer.java

License:Open Source License

public ArrayList<ArrayList<Token>> getSentences(Reader reader) {

    try {
        // Send reader data through the analyzer
        TokenStream tokstr = reusableTokenStream("", reader);
        TermAttribute tok_term = tokstr.getAttribute(TermAttribute.class);
        TypeAttribute tok_type = tokstr.getAttribute(TypeAttribute.class);
        FlagsAttribute tok_flags = tokstr.getAttribute(FlagsAttribute.class);
        PayloadAttribute tok_payload = tokstr.getAttribute(PayloadAttribute.class);

        // Split the tokenstream returned by the analyzer into sentences. Convert each sentence
        // into a linked list of tokens
        ArrayList<ArrayList<Token>> sentence_list = new ArrayList<ArrayList<Token>>();
        ArrayList<Token> current_sentence = new ArrayList<Token>();

        while (tokstr.incrementToken()) {
            Token current_token = new Token(tok_term.term(), tok_type.type(), tok_flags.getFlags(),
                    new ReviewTermPayload(tok_payload.getPayload()));
            current_sentence.add(current_token);

            // End of sentence reached. Add current sentence to the sentence list
            if (current_token.isDelim(true)) {
                if (current_sentence.size() > 1) {
                    sentence_list.add(current_sentence);
                }
                current_sentence = new ArrayList<Token>();
            }
        }

        // At the end of the token stream, if there is an incomplete sentence, add it to the
        // sentence list.
        // This case could occur when the last sentence of a given passage does not end with a
        // period or other sentence delimiter.
        if (!current_sentence.isEmpty()) {
            sentence_list.add(current_sentence);
        }

        return sentence_list;
    } catch (IOException e) {
        AppLogger.error.log(Level.SEVERE,
                "Error reading data from reader. Analyzing text for typed dependencies could not be completed");
        return null;
    }
}

From source file:di.uniba.it.wsd.RevisedLesk.java

License:Open Source License

/**
 *
 * @param text
 * @return
 * @throws IOException
 */
public Map<String, Float> buildBag(String text) throws IOException {
    Map<String, Float> bag = new HashMap<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
    SnowballStemmer stemmer = null;
    if (stemming) {
        stemmer = getStemmer(language);
        if (stemmer == null) {
            Logger.getLogger(RevisedLesk.class.getName()).log(Level.WARNING, "No stemmer for language {0}",
                    language);
        }
    }
    TokenStream tokenStream = analyzer.tokenStream("gloss", new StringReader(text));
    while (tokenStream.incrementToken()) {
        TermAttribute token = tokenStream.getAttribute(TermAttribute.class); // generic return type, explicit cast not needed
        String term = token.term();
        if (stemmer != null) {
            stemmer.setCurrent(term);
            if (stemmer.stem()) {
                term = stemmer.getCurrent();
            }
        }
        Float c = bag.get(term);
        if (c == null) {
            bag.put(term, 1f);
        } else {
            bag.put(term, c + 1f);
        }
    }
    return bag;
}

From source file:Document.DocumentProcessor.java

public final void processDocument(Document doc) {
    try {
        CharArraySet ch = new CharArraySet(Version.LUCENE_48, stopWords, true);
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(doc.getContent()));
        tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, ch); // use the same Version constant as the tokenizer above
        tokenStream = new PorterStemFilter(tokenStream);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        Set<String> uniqueWords = new HashSet<>();
        Map<String, Integer> wordFrequency = new HashMap<String, Integer>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = charTermAttribute.toString();
            uniqueWords.add(word);
            if (wordFrequency.containsKey(word))
                wordFrequency.put(word, wordFrequency.get(word) + 1);
            else
                wordFrequency.put(word, 1);
            dictionary.add(word);

        }
        doc.setUniqueWords(uniqueWords);
        doc.setWordFrequency(wordFrequency);

    } catch (IOException ex) {
        Logger.getLogger(DocumentProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }

}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenize(String data) {

    Set<String> transformedSet = new HashSet<String>(); //Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) { //ignore stopwords
                //System.out.println("Contains stopword: "+ term);
                continue;
            }
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1) //ignore 1 letter words
                continue;

            if (!digitPattern.matcher(term).find()) { //ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("transormed set size in tokenizeAndStem: "+ transformedSet.size());
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessRemoveStopWords(String data) {

    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1)
                continue;
            if (stopwords.contains(term))
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString().trim();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    //System.out.println("Stop words length:" + stopwords.size());
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(" ")
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;

    return StringUtils.join(transformedSet.toArray(), " ");
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(transformedSet.toArray(new String[transformedSet.size()]).toString());
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");

}

From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java

License:Apache License

@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);

        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produces singles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}