Usage examples for org.apache.lucene.analysis.TokenStream#reset()
public void reset() throws IOException
From source file:sh.isaac.provider.query.lucene.LuceneIndexer.java
License:Apache License
/** * Builds the prefix query./*ww w.java 2 s.c o m*/ * * @param searchString the search string * @param field the field * @param analyzer the analyzer * @return the query * @throws IOException Signals that an I/O exception has occurred. */ protected Query buildPrefixQuery(String searchString, String field, Analyzer analyzer) throws IOException { final TokenStream tokenStream; final List<String> terms; try (StringReader textReader = new StringReader(searchString)) { tokenStream = analyzer.tokenStream(field, textReader); tokenStream.reset(); terms = new ArrayList<>(); final CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class); while (tokenStream.incrementToken()) { terms.add(charTermAttribute.toString()); } } tokenStream.close(); analyzer.close(); final BooleanQuery.Builder bq = new BooleanQuery.Builder(); if ((terms.size() > 0) && !searchString.endsWith(" ")) { final String last = terms.remove(terms.size() - 1); bq.add(new PrefixQuery((new Term(field, last))), Occur.MUST); } terms.stream().forEach((s) -> { bq.add(new TermQuery(new Term(field, s)), Occur.MUST); }); return bq.build(); }
From source file:snu.controladores.indexador.Parser.java
/** * Realiza a tokenizao de uma string (Pega as palavras com split e extrai * seu radical)/*from ww w . j av a 2 s.c o m*/ * * @param analyzer * @param string * @return * @throws IOException */ private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException { List<String> result = new ArrayList<>(); TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } return result; }
From source file:snu.controladores.indexador.ProcessadorDeConsultas.java
/**
 * Tokenizes a string: splits it into words and reduces each to the form
 * produced by the given analyzer's filter chain (e.g. its stem).
 *
 * @param analyzer the analyzer used to split and normalize the text
 * @param string the text to tokenize
 * @return the list of token texts, in stream order
 * @throws IOException if tokenization fails
 */
private List<String> tokenizeString(Analyzer analyzer, String string) throws IOException {
    List<String> result = new ArrayList<>();
    // Close the stream deterministically (the original leaked it) and honor
    // the TokenStream contract: reset() before, end() after, iteration.
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    }
    return result;
}
From source file:stackoverflow.lucene.modified.MoreLikeThis.java
License:Apache License
/** * Adds term frequencies found by tokenizing text from reader into the Map words * * @param r a source of text to be tokenized * @param termFreqMap a Map of terms and their frequencies * @param fieldName Used by analyzer for any special per-field analysis */// w w w .j ava 2 s .c om private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException { if (analyzer == null) { throw new UnsupportedOperationException( "To use MoreLikeThis without " + "term vectors, you must provide an Analyzer"); } TokenStream ts = analyzer.tokenStream(fieldName, r); int tokenCount = 0; // for every token CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); ts.reset(); while (ts.incrementToken()) { String word = termAtt.toString(); tokenCount++; if (tokenCount > maxNumTokensParsed) { break; } if (isNoiseWord(word)) { continue; } // increment frequency Int cnt = termFreqMap.get(word); if (cnt == null) { termFreqMap.put(word, new Int()); } else { cnt.x++; } } ts.end(); ts.close(); }
From source file:StopWords.StopWords.java
public String removeStopwords(String input) { TokenStream tokenStream = new ClassicTokenizer(Version.LUCENE_35, new StringReader(input)); // remove stop words tokenStream = new StopFilter(Version.LUCENE_35, tokenStream, EnglishAnalyzer.getDefaultStopSet()); // retrieve the remaining tokens Set<String> tokens = new HashSet<String>(); CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); String str = ""; try {//from w w w. j a va 2 s. c o m tokenStream.reset(); } catch (IOException ex) { Logger.getLogger(StopWords.class.getName()).log(Level.SEVERE, null, ex); } try { while (tokenStream.incrementToken()) { tokens.add(token.toString()); str += token.toString() + " "; //System.out.println(token.toString()); } } catch (IOException e) { // log } return str; }
From source file:stroom.search.server.TestStandardAnalyser.java
License:Apache License
private void testAnalyser(final String input, final Analyzer analyzer) throws Exception { System.out.println("Testing analyser: " + analyzer.getClass().getName()); final ReusableStringReader reader = new ReusableStringReader(); reader.init(input);//from ww w. ja v a 2s .co m final TokenStream stream = analyzer.tokenStream("Test", reader); // reset the TokenStream to the first token stream.reset(); boolean hasMoreTokens = stream.incrementToken(); final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); for (;;) { if (!hasMoreTokens) break; // Get the text of this term. final char[] tokenText = termAtt.buffer(); final int tokenTextLen = termAtt.length(); System.out.println(new String(tokenText, 0, tokenTextLen)); hasMoreTokens = stream.incrementToken(); } }
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
/**
 * Reduces a single term to its Porter stem.
 *
 * @param term the word to stem; expected to produce exactly one token
 * @return the stem, or {@code null} when the input yields anything other than
 *         one token, or the stem contains characters outside {@code [\w-]}
 * @throws IOException if tokenization fails
 */
public static String stemmize(String term) throws IOException {
    Set<String> stems = new HashSet<String>();
    // ClassicTokenizer splits the input; PorterStemFilter stems each token.
    // try-with-resources closes the stream (the original leaked it and never
    // called end()).
    try (TokenStream tokenStream = new PorterStemFilter(
            new ClassicTokenizer(LUCENE_VERSION, new StringReader(term)))) {
        CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            stems.add(token.toString());
        }
        tokenStream.end();
    }
    // Zero or multiple distinct tokens means this was not a single stemmable term.
    if (stems.size() != 1) {
        return null;
    }
    String stem = stems.iterator().next();
    // Reject stems with unexpected characters (only word chars and '-' allowed).
    if (!stem.matches("[\\w-]+")) {
        return null;
    }
    return stem;
}
From source file:summarizer.KeywordsGuesser.java
License:Open Source License
public static List<Keyword> guessFromString(String input) throws IOException { input = input.replaceAll("-+", "-0"); input = input.replaceAll("[\\p{Punct}&&[^'-]]+", " "); input = input.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", ""); TokenStream tokenStream = new ClassicTokenizer(LUCENE_VERSION, new StringReader(input)); tokenStream = new LowerCaseFilter(LUCENE_VERSION, tokenStream); tokenStream = new ClassicFilter(tokenStream); tokenStream = new ASCIIFoldingFilter(tokenStream); tokenStream = new StopFilter(LUCENE_VERSION, tokenStream, EnglishAnalyzer.getDefaultStopSet()); List<Keyword> keywords = new LinkedList<Keyword>(); CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { String term = token.toString(); String stem = stemmize(term); if (stem != null) { Keyword keyword = find(keywords, new Keyword(stem.replaceAll("-0", "-"))); keyword.add(term.replaceAll("-0", "-")); }// ww w . j av a 2 s .com } Collections.sort(keywords); return keywords; }
From source file:test.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokens(TokenStream stream) throws IOException { stream.reset(); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); while (stream.incrementToken()) { System.out.print("[" + term + "] "); //B }//from w w w. j a va 2 s . c o m stream.close(); }
From source file:test.analysis.AnalyzerUtils.java
License:Apache License
public static void displayTokensWithPositions(Analyzer analyzer, String text) throws IOException { TokenStream stream = analyzer.tokenStream("contents", new StringReader(text)); CharTermAttribute term = stream.addAttribute(CharTermAttribute.class); PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class); stream.reset(); int position = 0; while (stream.incrementToken()) { int increment = posIncr.getPositionIncrement(); if (increment > 0) { position = position + increment; System.out.println(); System.out.print(position + ": "); }//from www. ja v a2 s. c o m System.out.print("[" + term + "] "); } System.out.println(); stream.close(); }