Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
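For orientation before the per-project examples below, here is a minimal, hedged sketch of the standard consumption loop for incrementToken() on recent Lucene versions (5.x and later); the field name and sample text are illustrative. The stream must be reset() before the first call, incrementToken() returns false once the tokens are exhausted, and end() plus close() finish the stream.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer();
        // try-with-resources closes the stream even if an exception is thrown
        try (TokenStream ts = analyzer.tokenStream("body", new StringReader("The quick brown fox"))) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                    // required before the first incrementToken() call
            while (ts.incrementToken()) {  // false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                      // records end-of-stream state (e.g. final offset)
        }
        analyzer.close();
    }
}

Several of the examples below use the older Lucene 3.x TermAttribute API instead of CharTermAttribute, but the loop structure is the same.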
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenize(String data) {
    Set<String> transformedSet = new HashSet<String>(); // Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: " + data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) { // ignore stopwords
                //System.out.println("Contains stopword: " + term);
                continue;
            }
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1-letter words
                continue;
            if (!digitPattern.matcher(term).find()) { // ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("transformed set size in tokenizeAndStem: " + transformedSet.size());
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    //System.out.println("String returned in tokenizeAndStem: " + strBuilder.toString());
    return strBuilder.toString();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessRemoveStopWords(String data) {
    StringBuilder strBuilder = new StringBuilder();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: " + data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (term.length() <= 1) // ignore 1-letter words
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("String returned in tokenizeAndStem: " + strBuilder.toString());
    return strBuilder.toString().trim();
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    //System.out.println("Stop words length: " + stopwords.size());
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);
    /*for (int i = 0; i < stemmedList.size(); i++)
        System.out.print(ds[i] + "\t");*/

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram joined with "_"
        }
    }
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;
    return StringUtils.join(transformedSet.toArray(), " ");
}
From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java
License:Apache License
public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();
    Tokenizer analyzer = new Tokenizer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) // ignore digits
                continue;
            if (stopwords.contains(term)) // ignore stopwords
                continue;
            if (term.length() <= 1) // ignore single-letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);
    /*for (int i = 0; i < stemmedList.size(); i++)
        System.out.print(ds[i] + "\t");*/

    // add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); // add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); // add bi-gram joined with "_"
        }
    }
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");
}
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if (lastEndOffset > maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
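The method above is the low-level entry point, so a hedged usage sketch may help. It assumes the edu.mit.ll.vizlinc.highlight classes mirror the standard org.apache.lucene.search.highlight API (Highlighter, QueryScorer, SimpleHTMLFormatter, TextFragment); the analyzer, Version constant, field name, query, and text are illustrative, not from the VizLinc source.

// Hedged sketch: wires a QueryScorer-backed Highlighter to getBestTextFragments(...).
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); // version constant is illustrative
String text = "Lucene is a search library. Lucene highlighting marks query terms.";
Query query = new TermQuery(new Term("body", "lucene"));

Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query));
TokenStream tokenStream = analyzer.tokenStream("body", new StringReader(text));

// false = do not merge contiguous fragments, 3 = maximum number of fragments to return
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, false, 3);
for (TextFragment frag : frags) {
    if (frag != null && frag.getScore() > 0) {
        System.out.println(frag.toString()); // fragment text with the query terms marked up
    }
}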
From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java
License:Apache License
@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produce shingles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}
From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java
License:Apache License
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();
    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}
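A brief, hedged usage sketch for this helper; the analyzer choice, Version constant, and input string are illustrative and not part of the SciGraph source.

// Assumes the Lucene 4.x classes already used by LuceneUtils are on the classpath.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46); // version constant is illustrative
List<String> tokens = LuceneUtils.getTokenization(analyzer, "Hereditary breast cancer");
System.out.println(tokens); // e.g. [hereditary, breast, cancer] with StandardAnalyzer's defaults
analyzer.close();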
From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java
License:Open Source License
void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
}
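A hypothetical test fragment showing how a helper like this is typically driven; the WhitespaceTokenizer, Version constant, and expected terms are illustrative and not taken from the Stanford test class.

// Illustrative only: feeds a plain WhitespaceTokenizer through the helper above (Lucene 4.x-era API).
Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_46, new StringReader("foo bar"));
TokenStream stream = tokenizer;
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
stream.reset();
assertTermEquals("foo", stream, termAtt);
assertTermEquals("bar", stream, termAtt);
assertFalse(stream.incrementToken()); // stream exhausted
stream.end();
stream.close();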
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error: close the TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is most likely to be associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}