Example usage for org.apache.lucene.analysis.TokenStream#incrementToken()

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream#incrementToken().

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (e.g., IndexWriter) use this method to advance the stream to the next token.

Usage

From source file:com.doculibre.constellio.lucene.BaseLuceneIndexHelper.java

License:Open Source License

/**
 * Runs {@code str} through {@code analyzer} and concatenates the resulting terms.
 * <p>
 * Terms are appended back to back, with no separator between them.
 *
 * @param str      the text to analyze
 * @param analyzer the analyzer to apply; when {@code null}, {@code str} is returned unchanged
 * @return the concatenated terms, or {@code str} itself when no analyzer was given
 * @throws IOException if the token stream fails while being consumed or closed
 */
public static String analyze(String str, Analyzer analyzer) throws IOException {
    if (analyzer == null) {
        return str;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(str));
    try {
        // Attributes are registered before reset(), then the full consumer
        // workflow is followed: reset -> incrementToken* -> end -> close.
        CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
        tokens.reset();
        while (tokens.incrementToken()) {
            norm.append(termAtt.buffer(), 0, termAtt.length());
        }
        tokens.end();
    } finally {
        // Always release the stream, even when consumption fails.
        tokens.close();
    }
    return norm.toString();
}

From source file:com.doculibre.constellio.utils.AnalyzerUtils.java

License:Open Source License

/**
 * Analyzes {@code phrase} with the default analyzer and returns the concatenated terms.
 * <p>
 * Terms are appended back to back with no separator; the result is trimmed.
 *
 * @param phrase       the phrase to analyze; blank or {@code null} input is returned unchanged
 * @param useStopWords passed through to {@code getDefaultAnalyzer} to select the analyzer
 * @return the trimmed concatenation of the analyzed terms, or {@code phrase} itself when it is blank
 * @throws RuntimeException wrapping any {@link IOException} raised by the token stream
 */
public static String analyzePhrase(String phrase, boolean useStopWords) {
    if (!StringUtils.isNotBlank(phrase)) {
        return phrase;
    }
    Analyzer analyzer = getDefaultAnalyzer(useStopWords);
    StringBuilder norm = new StringBuilder();
    try {
        TokenStream tokens = analyzer.tokenStream("", new StringReader(phrase));
        try {
            CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                norm.append(termAtt.buffer(), 0, termAtt.length());
            }
            tokens.end();
        } finally {
            // Always release the stream, even when consumption fails.
            tokens.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return norm.toString().trim();
}

From source file:com.faqit.similarity.NGramExtractor.java

License:Open Source License

/**
 * Extracts n-grams from a string of text. Can handle n-grams of any length
 * and can also perform stop word removal before extraction.
 * <p>
 * The results are stored in the {@code nGrams}, {@code uniqueNGrams} and
 * {@code nGramFreqs} fields, which are re-created on every call.
 *
 * @param text
 *            the text that the n-grams should be extracted from
 * @param length
 *            the length of the n-grams
 * @param stopWords
 *            whether or not stop words should be removed before extraction;
 *            must not be {@code null} (it is unboxed)
 * @param overlap
 *            whether or not the n-grams should overlap; must not be
 *            {@code null} (it is unboxed)
 * @throws FileNotFoundException
 *             declared by the signature; not thrown directly by the code in
 *             this method
 * @throws IOException
 *             if the token stream fails while being consumed or closed
 */
public void extract(String text, int length, Boolean stopWords, Boolean overlap)
        throws FileNotFoundException, IOException {

    this.text = text;
    this.length = length;
    this.stopWords = stopWords;
    this.overlap = overlap;

    nGrams = new LinkedList<String>();
    uniqueNGrams = new LinkedList<String>();
    nGramFreqs = new HashMap<String, Integer>();

    /*
     * For unigrams a plain analyzer is enough: StandardAnalyzer when stop
     * words should be removed, SimpleAnalyzer when they should be kept.
     * NOTE(review): the original comment named StopAnalyzer for the
     * stop-word case, but the code uses StandardAnalyzer - confirm intent.
     */
    if (length == 1) {
        if (this.stopWords) {
            analyzer = new StandardAnalyzer();
        } else {
            analyzer = new SimpleAnalyzer();
        }
    } else {
        // Bigger than unigrams, so wrap the base analyzer in a
        // ShingleAnalyzerWrapper; the base analyzer again depends on
        // stop word removal.
        if (this.stopWords) {
            // Original author's note: this is a hack to use Lucene 2.4,
            // since in 2.4 position increments weren't preserved by
            // default. Using a later version puts underscores (_) in the
            // place of removed stop words.
            analyzer = new ShingleAnalyzerWrapper(new StopAnalyzer(), length, length, " ", false, false, "");
        } else {
            analyzer = new ShingleAnalyzerWrapper(new SimpleAnalyzer(), length, length, " ", false, false, "");
        }
    }

    // Process the text and collect the n-grams.
    TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(this.text));
    try {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        // Once incrementToken() has returned false the stream must not be
        // advanced again, so exhaustion is tracked across both loops.
        boolean exhausted = false;
        while (!exhausted && tokenStream.incrementToken()) {
            nGrams.add(charTermAttribute.toString());

            // If n-grams are not allowed to overlap, skip ahead to the
            // first token that shares nothing with the one just recorded.
            if (!overlap) {
                for (int i = 0; i < length - 1 && !exhausted; i++) {
                    exhausted = !tokenStream.incrementToken();
                }
            }
        }
        tokenStream.end();
    } finally {
        // Always release the stream, even when consumption fails.
        tokenStream.close();
    }

    // Record each distinct n-gram once, and count its occurrences.
    for (String nGram : nGrams) {
        Integer seen = nGramFreqs.get(nGram);
        if (seen == null) {
            nGramFreqs.put(nGram, 1);
            uniqueNGrams.add(nGram);
        } else {
            nGramFreqs.put(nGram, seen + 1);
        }
    }

}

From source file:com.finderbots.miner.PhraseShingleAnalyzer.java

License:Apache License

/**
 * Tokenizes {@code contentText} with the configured analyzer and returns the non-empty terms.
 *
 * @param contentText the text to tokenize
 * @return the terms produced by the analyzer, in stream order, skipping zero-length terms
 * @throws RuntimeException wrapping any {@link IOException} raised by the token stream
 */
public List<String> getTermList(String contentText) {
    TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
    TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);

    // Rough capacity guess: about one term per ten characters of input.
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    try {
        try {
            while (stream.incrementToken()) {
                if (termAtt.termLength() > 0) {
                    result.add(termAtt.term());
                }
            }
            stream.end();
        } finally {
            // Always release the stream, even when consumption fails.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:com.flaptor.indextank.query.IndexEngineParser.java

License:Apache License

/**
 * Lazily tokenizes {@code content} for the given field.
 * <p>
 * Each returned token carries its text, its start and end offsets, and a
 * position computed as the running sum of the stream's position increments.
 * The underlying stream is ended and closed once it is exhausted.
 *
 * @param fieldName the field name passed to the analyzer
 * @param content   the text to tokenize
 * @return an iterator over the tokens produced by the analyzer
 */
public Iterator<AToken> parseDocumentField(String fieldName, String content) {
    final TokenStream tkstream = analyzer.tokenStream(fieldName, new StringReader(content));
    final TermAttribute termAtt = tkstream.addAttribute(TermAttribute.class);
    final PositionIncrementAttribute posIncrAttribute = tkstream.addAttribute(PositionIncrementAttribute.class);
    final OffsetAttribute offsetAtt = tkstream.addAttribute(OffsetAttribute.class);

    return new AbstractIterator<AToken>() {
        int currentPosition = 0;

        @Override
        protected AToken computeNext() {
            try {
                if (!tkstream.incrementToken()) {
                    tkstream.end();
                    tkstream.close();
                    return endOfData();
                }
            } catch (IOException e) {
                // Not expected with a StringReader. If it does happen, fail
                // loudly rather than fall through and emit a token built
                // from stale attribute values.
                throw new RuntimeException("Failed to advance token stream", e);
            }
            currentPosition += posIncrAttribute.getPositionIncrement();
            // Snapshot the attribute values: the attribute instances are
            // reused by the stream for the next token.
            final int position = currentPosition;
            final int startOffset = offsetAtt.startOffset();
            final int endOffset = offsetAtt.endOffset();
            final String text = termAtt.term();
            return new AToken() {
                @Override
                public String getText() {
                    return text;
                }

                @Override
                public int getPosition() {
                    return position;
                }

                @Override
                public int getStartOffset() {
                    return startOffset;
                }

                @Override
                public int getEndOffset() {
                    return endOffset;
                }
            };
        }
    };

}

From source file:com.fujitsu.ca.fic.caissepop.evaluation.TokenizeText.java

License:Apache License

/**
 * Tokenizes the first field of {@code input} and returns one single-field tuple per token.
 *
 * @param input the input tuple; its first field is converted with {@code toString()} and analyzed
 * @return a bag with one tuple per token, or {@code null} when the input is
 *         {@code null}, empty, or has a null first field
 * @throws IOException if the token stream fails while being consumed or closed
 */
@Override
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 1 || input.isNull(0)) {
        return null;
    }

    DataBag bagOfTokens = bagFactory.newDefaultBag();
    TokenStream tokenStream = null;
    try {
        String lineOfText = input.get(0).toString();
        StringReader textInput = new StringReader(lineOfText);
        tokenStream = analyzer.tokenStream(noField, textInput);
        // addAttribute returns the existing instance or registers a new
        // one, so it cannot fail on a stream that lacks the attribute.
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        while (tokenStream.incrementToken()) {
            Tuple termText = tupleFactory.newTuple(termAttribute.toString());
            bagOfTokens.add(termText);
            // Kept from the original: clears the shared term buffer after
            // the text has been copied out.
            termAttribute.setEmpty();
        }
        // Signal end-of-stream before closing, as the consumer workflow requires.
        tokenStream.end();
    } finally {
        if (tokenStream != null) {
            tokenStream.close();
        }
    }
    return bagOfTokens;
}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Runs {@code query} through {@code analyzer} and joins the resulting terms with single spaces.
 *
 * @param query    the query text; returned unchanged when {@code null} or empty
 * @param analyzer the analyzer to apply; when {@code null}, {@code query} is returned unchanged
 * @return the analyzed terms joined by spaces and trimmed, or {@code query} itself when nothing was analyzed
 * @throws RuntimeException wrapping any {@link IOException} raised by the token stream
 */
private String analyzeQuery(String query, Analyzer analyzer) {

    if (analyzer == null || query == null || query.isEmpty()) {
        return query;
    }

    StringBuilder newQueryB = new StringBuilder();
    // try-with-resources closes the stream on every path and keeps a
    // close() failure from replacing the exception that caused it.
    try (TokenStream tokenStream = analyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(query))) {
        // The attribute instance is stable for the stream's lifetime, so
        // it is looked up once instead of on every token.
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            newQueryB.append(term.toString());
            newQueryB.append(' ');
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    }

    return newQueryB.toString().trim();

}

From source file:com.github.healthonnet.search.SynonymExpandingExtendedDismaxQParserPlugin.java

License:Apache License

/**
 * Given the synonymAnalyzer, returns a list of all alternate queries expanded from the original user query.
 * /* w  w w  .j  av a  2s .c  o m*/
 * @param synonymAnalyzer
 * @param solrParams
 * @return
 */
private List<Query> generateSynonymQueries(Analyzer synonymAnalyzer, SolrParams solrParams) {

    String origQuery = getQueryStringFromParser();
    int queryLen = origQuery.length();

    // TODO: make the token stream reusable?
    TokenStream tokenStream = synonymAnalyzer.tokenStream(Const.IMPOSSIBLE_FIELD_NAME,
            new StringReader(origQuery));

    SortedSetMultimap<Integer, TextInQuery> startPosToTextsInQuery = TreeMultimap.create();

    boolean constructPhraseQueries = solrParams.getBool(Params.SYNONYMS_CONSTRUCT_PHRASES, false);

    boolean bag = solrParams.getBool(Params.SYNONYMS_BAG, false);
    List<String> synonymBag = new ArrayList<>();

    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute term = tokenStream.getAttribute(CharTermAttribute.class);
            OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
            TypeAttribute typeAttribute = tokenStream.getAttribute(TypeAttribute.class);

            if (!typeAttribute.type().equals("shingle")) {
                // ignore shingles; we only care about synonyms and the original text
                // TODO: filter other types as well

                String termToAdd = term.toString();

                if (typeAttribute.type().equals("SYNONYM")) {
                    synonymBag.add(termToAdd);
                }

                // Don't quote sibgle term term synonyms
                if (constructPhraseQueries && typeAttribute.type().equals("SYNONYM")
                        && termToAdd.contains(" ")) {
                    // Don't Quote when original is already surrounded by quotes
                    if (offsetAttribute.startOffset() == 0 || offsetAttribute.endOffset() == queryLen
                            || origQuery.charAt(offsetAttribute.startOffset() - 1) != '"'
                            || origQuery.charAt(offsetAttribute.endOffset()) != '"') {
                        // make a phrase out of the synonym
                        termToAdd = new StringBuilder(termToAdd).insert(0, '"').append('"').toString();
                    }
                }
                if (!bag) {
                    // create a graph of all possible synonym combinations,
                    // e.g. dog bite, hound bite, dog nibble, hound nibble, etc.
                    TextInQuery textInQuery = new TextInQuery(termToAdd, offsetAttribute.startOffset(),
                            offsetAttribute.endOffset());

                    startPosToTextsInQuery.put(offsetAttribute.startOffset(), textInQuery);
                }
            }
        }
        tokenStream.end();
    } catch (IOException e) {
        throw new RuntimeException("uncaught exception in synonym processing", e);
    } finally {
        try {
            tokenStream.close();
        } catch (IOException e) {
            throw new RuntimeException("uncaught exception in synonym processing", e);
        }
    }

    List<String> alternateQueries = synonymBag;

    if (!bag) {
        // use a graph rather than a bag
        List<List<TextInQuery>> sortedTextsInQuery = new ArrayList<>(startPosToTextsInQuery.values().size());
        sortedTextsInQuery.addAll(startPosToTextsInQuery.asMap().values().stream().map(ArrayList::new)
                .collect(Collectors.toList()));

        // have to use the start positions and end positions to figure out all possible combinations
        alternateQueries = buildUpAlternateQueries(solrParams, sortedTextsInQuery);
    }

    // save for debugging purposes
    expandedSynonyms = alternateQueries;

    return createSynonymQueries(solrParams, alternateQueries);
}

From source file:com.github.ippeiukai.externaltoken.lucene.analysis.TestPatternTokenizer.java

License:Apache License

/**
 * Drains {@code in} and returns its terms joined by single spaces, then closes the stream.
 * <p>
 * TODO: rewrite tests not to use string comparison.
 *
 * @param in the token stream to consume
 * @return the terms of the stream separated by single spaces
 * @throws IOException if the stream fails while being consumed or closed
 */
private static String tsToString(TokenStream in) throws IOException {
    final StringBuilder joined = new StringBuilder();
    final CharTermAttribute term = in.addAttribute(CharTermAttribute.class);

    // Poison the attribute state up front so a stream that fails to set
    // its own term shows up as "bogusTerm" in the output.
    in.clearAttributes();
    term.setEmpty().append("bogusTerm");

    boolean first = true;
    while (in.incrementToken()) {
        if (first) {
            first = false;
        } else {
            joined.append(' ');
        }
        joined.append(term.toString());

        // Re-poison after every token, for the same reason as above.
        in.clearAttributes();
        term.setEmpty().append("bogusTerm");
    }

    in.close();
    return joined.toString();
}

From source file:com.github.le11.nls.lucene.UIMABaseAnalyzerTest.java

License:Apache License

/**
 * Smoke test: consumes the analyzer's token stream for a fixed sentence and
 * prints each token with its offsets. Any exception fails the test with its
 * full stack trace.
 */
@Test
public void baseUIMAAnalyzerStreamTest() throws Exception {
    TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
    try {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        // The attribute instances never change, so one check is enough.
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);

        // Follow the full consumer workflow: reset -> incrementToken* -> end -> close.
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + ","
                    + offsetAtt.endOffset());
        }
        ts.end();
    } finally {
        ts.close();
    }
}