Example usage for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis.TokenStream.incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
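
Before the collected examples, here is a minimal sketch of the typical consumer loop around incrementToken(), assuming a recent Lucene release (5.x or later, where reset(), end(), and close() are mandatory) and StandardAnalyzer; the class name, field name, and sample text are illustrative only. Several of the examples below target older 3.x releases and read terms through the deprecated TermAttribute; current code uses CharTermAttribute, as shown here.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {

    // Collects the analyzed terms of a text by driving the stream with incrementToken().
    public static List<String> tokenize(String text) throws IOException {
        List<String> terms = new ArrayList<>();
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("field", text)) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // required before the first incrementToken()
            while (stream.incrementToken()) { // returns false at the end of the stream
                terms.add(termAtt.toString());
            }
            stream.end();                     // records the final offset state
        }                                     // close() is handled by try-with-resources
        return terms;
    }
}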

Usage

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenize(String data) {

    Set<String> transformedSet = new HashSet<String>(); //Set will make sure only unique terms are kept
    StringBuilder strBuilder = new StringBuilder();
    // StandardAnalyzer is assumed here; Lucene's abstract Tokenizer cannot be instantiated directly.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (stopwords.contains(term)) { //ignore stopwords
                //System.out.println("Contains stopword: "+ term);
                continue;
            }
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1) //ignore 1 letter words
                continue;

            if (!digitPattern.matcher(term).find()) { //ignore digits
                stemmer.setCurrent(term);
                stemmer.stem();
                transformedSet.add(stemmer.getCurrent());
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("transormed set size in tokenizeAndStem: "+ transformedSet.size());
    for (Object token : transformedSet.toArray()) {
        strBuilder.append(token).append(" ");
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessRemoveStopWords(String data) {

    StringBuilder strBuilder = new StringBuilder();
    // StandardAnalyzer is assumed here; Lucene's abstract Tokenizer cannot be instantiated directly.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    //System.out.println("The value of data in tokenizeAndStem: "+ data);
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (term.length() <= 1)
                continue;
            if (stopwords.contains(term))
                continue;
            strBuilder.append(term).append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    //System.out.println("String returned in tokenizeAndStem:"+ strBuilder.toString());
    return strBuilder.toString().trim();
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static Set<String> preprocessStemAndTokenizeAddBigramsInSet(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    //System.out.println("Stop words length:" + stopwords.size());
    // StandardAnalyzer is assumed here; Lucene's abstract Tokenizer cannot be instantiated directly.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(" ")
    stemmedList.clear();
    stemmedList = null;
    ds = null;
    return transformedSet;
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeReturnDistinctTokens(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize ..");
    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    // StandardAnalyzer is assumed here; Lucene's abstract Tokenizer cannot be instantiated directly.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());
        }
        transformedSet.addAll(stemmedList);
    } catch (Exception e) {
        e.printStackTrace();
    }
    stemmedList.clear();
    stemmedList = null;

    return StringUtils.join(transformedSet.toArray(), " ");
}

From source file:edu.isi.pfindr.learn.util.CleanDataUtil.java

License:Apache License

public static String preprocessStemAndTokenizeAddBigramsInString(String data) {
    //System.out.println("Preprocess data, remove stop words, stem, tokenize and get bi-grams ..");

    Set<String> transformedSet = new LinkedHashSet<String>();
    List<String> stemmedList = new ArrayList<String>();

    // StandardAnalyzer is assumed here; Lucene's abstract Tokenizer cannot be instantiated directly.
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data));
    TermAttribute termAttribute;
    String term;
    try {
        while (tokenStream.incrementToken()) {
            termAttribute = tokenStream.getAttribute(TermAttribute.class);
            term = termAttribute.term();
            if (digitPattern.matcher(term).find()) //ignore digits
                continue;
            if (stopwords.contains(term)) //ignore stopwords
                continue;
            if (term.length() <= 1) //ignore single letter words
                continue;
            stemmer.setCurrent(term);
            stemmer.stem();
            stemmedList.add(stemmer.getCurrent());

        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    String[] ds = stemmedList.toArray(new String[0]);

    /*for(int i=0; i<stemmedList.size(); i++)
       System.out.print(ds[i]+"\t");*/

    //add bi-grams
    final int size = 2;
    for (int i = 0; i < ds.length; i++) {
        transformedSet.add(ds[i]); //add single words
        if (i + size <= ds.length) {
            String t = "";
            for (int j = i; j < i + size; j++) {
                t += " " + ds[j];
            }
            t = t.trim().replaceAll("\\s+", "_");
            transformedSet.add(t); //add bi-gram combined with "_"
        }
    }
    //System.out.println(transformedSet.toArray(new String[transformedSet.size()]).toString());
    return StringUtils.join(transformedSet.toArray(new String[transformedSet.size()]), " ");

}

From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java

License:Apache License

/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {

        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                //the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                //store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                //check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    //record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            //            if(lastEndOffset>maxDocBytesToAnalyze)
            //            {
            //               break;
            //            }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            //flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            //store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        //Test what remains of the original text beyond the point where we stopped analyzing 
        if (
        //               if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        //               and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            //append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        //sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            //If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
                   if (currentFrag.getScore() >= minScore)
                   {
                      fragQueue.put(currentFrag);
                      if (fragQueue.size() > maxNumFragments)
                      { // if hit queue overfull
                         fragQueue.pop(); // remove lowest in hit queue
                         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                      }
                    
                    
                   }
            */
            //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            //fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        //return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        //merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }

        return frag;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
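
The getBestTextFragments listing above is the low-level entry point; a typical caller obtains the TokenStream from an Analyzer and wires the highlighter up with a scorer and formatter. The sketch below is a minimal, hypothetical caller, assuming the standard org.apache.lucene.search.highlight companion classes (Highlighter, QueryScorer, SimpleHTMLFormatter) that this edu.mit.ll.vizlinc copy mirrors, and a Lucene release whose StandardAnalyzer needs no Version argument; the field name "body", the query term, and the sample text are illustrative only.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TextFragment;

public class BestFragmentsSketch {
    public static void main(String[] args) throws Exception {
        String text = "TokenStream consumers advance the stream with incrementToken.";
        Query query = new TermQuery(new Term("body", "incrementtoken")); // analyzed (lowercased) form

        StandardAnalyzer analyzer = new StandardAnalyzer();
        Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query, "body"));

        // getBestTextFragments resets, consumes, and closes the stream itself.
        TokenStream tokens = analyzer.tokenStream("body", new StringReader(text));
        TextFragment[] frags = highlighter.getBestTextFragments(tokens, text, false, 3);
        for (TextFragment frag : frags) {
            if (frag != null && frag.getScore() > 0) {
                System.out.println(frag.toString()); // fragment text with the match marked up
            }
        }
        analyzer.close();
    }
}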

From source file:edu.sdsc.scigraph.annotation.ShingleProducer.java

License:Apache License

@Override
public void run() {
    Deque<Token<String>> buffer = new LinkedList<>();
    try {
        TokenStream stream = analyzer.tokenStream("", reader);
        OffsetAttribute offset = stream.getAttribute(OffsetAttribute.class);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);

        try {
            while (stream.incrementToken()) {
                Token<String> token = new Token<String>(term.toString(), offset.startOffset(),
                        offset.endOffset());
                buffer.offer(token);
                if (buffer.size() < shingleCount) {
                    // Fill the buffer first, before offering anything to the queue
                    continue;
                }
                addBufferToQueue(buffer);
                if (shingleCount == buffer.size()) {
                    buffer.pop();
                }
            }
        } catch (IOException e) {
            logger.log(Level.WARNING, "Failed to produce shingles", e);
        }
        while (!buffer.isEmpty()) {
            addBufferToQueue(buffer);
            buffer.pop();
        }
        queue.put(END_TOKEN);
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
    }
}

From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java

License:Apache License

public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();

    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}

From source file:edu.stanford.lucene.analysis.TestCJKFoldingFilter.java

License:Open Source License

void assertTermEquals(String expected, TokenStream stream, CharTermAttribute termAtt) throws Exception {
    assertTrue(stream.incrementToken());
    assertEquals(expected, termAtt.toString());
}

From source file:edu.stanford.rad.naivebayes.ClassifyLines.java

License:Apache License

public static void main(String[] args) throws Exception {
    //      if (args.length < 5) {
    //         System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //         return;
    //      }
    //      String modelPath = args[0];
    //      String labelIndexPath = args[1];
    //      String dictionaryPath = args[2];
    //      String documentFrequencyPath = args[3];
    //      String tweetsPath = args[4];

    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();
        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }
        // With the classifier, we get one score for each label 
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print("  " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}