List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
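reset() must be called once, after the consumer has obtained its attributes and before the first call to incrementToken(); end() and close() finish the stream. Every example below follows that contract, and in modern Lucene versions skipping reset() typically fails with an IllegalStateException about a "TokenStream contract violation". As a minimal, self-contained sketch of the pattern (assuming a recent Lucene where StandardAnalyzer has a no-argument constructor; the field name "f" is arbitrary):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetUsageSketch {

    // Tokenize text with the given analyzer, following the reset/incrementToken/end/close contract.
    public static List<String> tokens(Analyzer analyzer, String text) throws IOException {
        List<String> out = new ArrayList<>();
        try (TokenStream ts = analyzer.tokenStream("f", text)) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                          // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                out.add(term.toString());
            }
            ts.end();                            // perform end-of-stream operations (e.g. final offset)
        }                                        // try-with-resources closes the stream
        return out;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokens(new StandardAnalyzer(), "TokenStream reset usage example"));
    }
}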
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if(lastEndOffset>maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
        // if there is text beyond the last token considered..
        (lastEndOffset < text.length()) &&
        // and that text is not too large...
                (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
            if (currentFrag.getScore() >= minScore) {
                fragQueue.put(currentFrag);
                if (fragQueue.size() > maxNumFragments) { // if hit queue overfull
                    fragQueue.pop(); // remove lowest in hit queue
                    minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
                }
            }
            */
            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;
    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
From source file:edu.sdsc.scigraph.lucene.LuceneUtils.java
License:Apache License
public static List<String> getTokenization(Analyzer analyzer, CharSequence term) {
    List<String> ret = Lists.newArrayList();
    try {
        TokenStream stream = analyzer.tokenStream("", new StringReader(term.toString()));
        CharTermAttribute token = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(token.toString());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return ret;
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
//  if (args.length < 5) {
//      System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
//      return;
//  }
//  String modelPath = args[0];
//  String labelIndexPath = args[1];
//  String dictionaryPath = args[2];
//  String documentFrequencyPath = args[3];
//  String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
From source file:edu.utsa.sifter.DocMaker.java
License:Apache License
public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        toks.reset();
        if (!toks.incrementToken()) {
            // System.out.println("empty body, won't index");
            toks.close();
            return false;
        }
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}
From source file:edu.virginia.cs.utility.StringTokenizer.java
/**
 * Method that generates list of tokens from the parameter string.
 *
 * @param string
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:elhuyar.bilakit.PayloadQParserPlugin.java
License:Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;

        // Note that this will work for any field defined with the
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java
License:Apache License
/**
 * Count the token stream tokens.
 *
 * @return it returns the no:of stream tokens.
 * @throws IOException if the file would have any IO operation.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }
    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}
From source file:fry.future.plugin.example.APP.java
private static List<String> tokenString(Analyzer analyzer, String str) throws IOException {
    List<String> result = new ArrayList<>();
    TokenStream tokenStream = analyzer.tokenStream("Test", new StringReader(str));
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        result.add(tokenStream.getAttribute(CharTermAttribute.class).toString());
    }
    return result;
}
From source file:gr.aueb.demo.PropertyRegistryBean.java
public static String removeStopWords(String textFile) {
    // CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();
    CharArraySet stopWords = PropertyRegistryBean.stopSet;
    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(textFile.trim()));
    tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, stopWords);
    StringBuilder sb = new StringBuilder();
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    try {
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            sb.append(term + " ");
        }
    } catch (IOException ex) {
        Logger.getLogger(PropertyRegistryBean.class.getName()).log(Level.SEVERE, null, ex);
    }
    return sb.toString();
}