List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
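reset() is part of the TokenStream consumer contract: the consumer adds attributes, calls reset() before the first incrementToken(), iterates until incrementToken() returns false, then calls end() and close(). A minimal self-contained sketch of that workflow (assuming a Lucene 4.x StandardAnalyzer; the field name and input text are illustrative):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ResetExample {
    public static void main(String[] args) throws IOException {
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        TokenStream stream = analyzer.tokenStream("field", new StringReader("The quick brown fox"));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();                  // mandatory before the first incrementToken()
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();                    // records end-of-stream attribute state
        stream.close();                  // releases resources held by the stream
        analyzer.close();
    }
}

The examples below all follow some variant of this pattern.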
From source file: di.uniba.it.tri.occ.BuildOccurrence.java
License: Open Source License

private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            // for apostrophe-split tokens, keep the longest fragment
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}
From source file: doc2vec.LuceneDocIterator.java

String preProcess(Analyzer analyzer, String text) throws Exception {
    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }
        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
From source file: Document.DocumentProcessor.java

public final void processDocument(Document doc) {
    try {
        CharArraySet ch = new CharArraySet(Version.LUCENE_48, stopWords, true);
        TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_48, new StringReader(doc.getContent()));
        tokenStream = new StopFilter(Version.LUCENE_48, tokenStream, ch); // version now matches the tokenizer
        tokenStream = new PorterStemFilter(tokenStream);
        CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
        Set<String> uniqueWords = new HashSet<>();
        Map<String, Integer> wordFrequency = new HashMap<>();
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = charTermAttribute.toString();
            uniqueWords.add(word);
            if (wordFrequency.containsKey(word))
                wordFrequency.put(word, wordFrequency.get(word) + 1);
            else
                wordFrequency.put(word, 1);
            dictionary.add(word);
        }
        tokenStream.end();
        tokenStream.close();
        doc.setUniqueWords(uniqueWords);
        doc.setWordFrequency(wordFrequency);
    } catch (IOException ex) {
        Logger.getLogger(DocumentProcessor.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file: drakkar.mast.retrieval.analysis.NGramQuery.java

/**
 * @param analyzer
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer analyzer, String queryTerm, String field) throws IOException {
    // split the query on spaces
    String[] words;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[] { queryTerm };
    }
    if (words.length > 1) {
        // more than one term: add each as an optional (SHOULD) clause
        for (int i = 0; i < words.length; i++) {
            Term t = new Term(field, words[i]);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        // a single term: analyze it and add each token as a required (MUST) clause
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }
            tokens.end();
            tokens.close();
        }
    }
}
From source file: drakkar.mast.retrieval.ngram.NGramQuery.java

/**
 * @param a
 * @param queryTerm
 * @param field
 * @throws IOException
 */
public NGramQuery(Analyzer a, String queryTerm, String field) throws IOException {
    String[] words;
    if (queryTerm.contains(" ")) {
        words = queryTerm.split(" ");
    } else {
        words = new String[] { queryTerm };
    }
    if (words.length > 1) {
        for (int i = 0; i < words.length; i++) {
            Term t = new Term(field, words[i]);
            TermQuery pquery = new TermQuery(t);
            add(pquery, org.apache.lucene.search.BooleanClause.Occur.SHOULD);
        }
    } else {
        for (int i = 0; i < words.length; i++) {
            String wordToAnalyze = words[i];
            TokenStream tokens = a.tokenStream(field, new StringReader(wordToAnalyze));
            TermAttribute termAtt = (TermAttribute) tokens.addAttribute(TermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                TermQuery pquery = new TermQuery(new Term(field, termAtt.term()));
                add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
            }
            tokens.end();
            tokens.close();
        }
    }
}
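Both NGramQuery variants read term text through TermAttribute, an API from the pre-4.0 Lucene line that was later removed. Under Lucene 4.x the analysis loop would use CharTermAttribute instead; a sketch of just that loop (not from either source file, and assuming the surrounding BooleanQuery logic and the analyzer, field, and wordToAnalyze variables stay as above):

TokenStream tokens = analyzer.tokenStream(field, new StringReader(wordToAnalyze));
CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
tokens.reset();
while (tokens.incrementToken()) {
    TermQuery pquery = new TermQuery(new Term(field, termAtt.toString()));
    add(new BooleanClause(pquery, org.apache.lucene.search.BooleanClause.Occur.MUST));
}
tokens.end();
tokens.close();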
From source file: edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.TFIDF.java
License: Open Source License

protected void computeTFIDF(List<TFIDFTerm> wordList, int totalWordsDoc) {
    if (reader != null && searcher != null) {
        double tf;
        double idf;
        double tfidf;
        EnglishAnalyzer analyzer = new EnglishAnalyzer(Version.LUCENE_40);
        TokenStream stream = null;
        CharTermAttribute termAtt;
        String term;
        double totalWikiDocs = (double) reader.numDocs();
        for (TFIDFTerm word : wordList) {
            try {
                term = "";
                stream = analyzer.tokenStream("field", new StringReader(word.word));
                termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                // concatenate all tokens until the stream is exhausted
                while (stream.incrementToken()) {
                    term += termAtt.toString();
                }
                stream.end();
                tf = (double) word.count / (double) totalWordsDoc;
                double wikiTermFrec = reader.docFreq(new Term("contents", term));
                if (wikiTermFrec != 0) {
                    idf = Math.log(totalWikiDocs / wikiTermFrec);
                    tfidf = tf * idf;
                } else {
                    tfidf = 0;
                }
                word.tfidf = tfidf;
            } catch (IOException ex) {
                logger.error("Error processing the TFIDF", ex);
            } finally {
                try {
                    if (stream != null) {
                        stream.close();
                    }
                } catch (IOException ex) {
                    logger.error("Error processing the TFIDF", ex);
                }
            }
        }
        try {
            reader.close();
        } catch (IOException ex) {
            logger.warn("Error closing lucene reader", ex);
        }
    }
}
From source file: edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java
License: Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    stream.end();
    elapsedTime += System.nanoTime() - initCPU;
    context.write(key, document);
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License: Apache License

private static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());
    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    return result;
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorUtil.java
License: Apache License

public static Vector transform2Vector(String text, String field, Analyzer analyzer, HTRCFilter filter,
        Dictionary dictionary) throws IOException {
    Vector result = new RandomAccessSparseVector(dictionary.size());
    TokenStream stream = analyzer.reusableTokenStream(field, new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = new String(termAtt.buffer(), 0, termAtt.length()).toLowerCase();
        if (filter.accept(term, 0)) {
            int index = dictionary.get(term);
            result.setQuick(index, result.get(index) + 1);
        }
    }
    stream.end();
    return result;
}
From source file: edu.indiana.d2i.htrc.util.filter.HTRCFilterAnalyzerTest.java
License: Apache License

public static void main(String[] args) throws IOException {
    HTRCFilterAnalyzer analyzer = new HTRCFilterAnalyzer();
    TokenStream stream = analyzer.reusableTokenStream("field",
            new StringReader("a iss Pierre 1 Vinken , 61 years old , "
                    + "will join the board as joins a nonexecutive joining director Nov. "
                    + "29 .Mr. car Vinken is cars chairman of Elsevier N.V. , the Dutch "
                    + "publishing group ."));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            System.out.println(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    System.out.println("Done???");
}
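A closing note on the HTRC examples: Analyzer.reusableTokenStream is an API from the Lucene 3.x line that was deprecated late in 3.x and removed in 4.0, where Analyzer.tokenStream reuses the underlying components itself. The equivalent call under Lucene 4.x is simply (a sketch, assuming the same analyzer and field):

TokenStream stream = analyzer.tokenStream("field", new StringReader(value.toString()));

The reset()/incrementToken()/end()/close() consumer contract shown throughout this page is unchanged by that migration.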