Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token. It returns false once the end of the stream is reached; callers must reset() the stream before the first call and call end() after the last token has been consumed.
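
As a quick orientation before the examples, here is a minimal sketch of the full consumer contract (reset, incrementToken loop, end, close). It assumes a recent Lucene release (5.x or later), where Analyzer.tokenStream accepts a String directly and StandardAnalyzer has a no-argument constructor; the field name and sample text are placeholders.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // "body" is a placeholder field name; StandardAnalyzer does not use it
        try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // mandatory before the first incrementToken() call
            while (stream.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            stream.end();                     // records end-of-stream attribute state
        }
        analyzer.close();
    }
}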

Usage

From source file:ddf.catalog.pubsub.criteria.contextual.ContextualEvaluator.java

License:Open Source License

private static void logTokens(Analyzer analyzer, String fieldName, String fullDocument, String analyzerName)
        throws IOException {
    if (!LOGGER.isDebugEnabled()) {
        return;
    }

    TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(fullDocument));
    OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);
    tokenStream.reset(); // the stream must be reset before consumption
    LOGGER.debug("-----  {} tokens  -----", analyzerName);
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String term = termAttribute.term();
        LOGGER.debug("{} [{}, {}]", term, startOffset, endOffset);
    }
    LOGGER.debug("-----  END:  {} tokens  -----", analyzerName);
}
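
Note that this first example uses the pre-4.0 attribute API: TermAttribute.term() was deprecated in Lucene 3.1 and removed in 4.0, so on current releases the term text is read through CharTermAttribute instead, as the remaining examples show.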

From source file:de.berlinbuzzwords.AnalyzerPrinter.java

License:Apache License

public void printTerms(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));

    // Reset stream before token consumption
    stream.reset();

    // Attribute to get the term text for a token
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    // Output source text
    System.out.println("\ntext: " + text);

    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text
        System.out.println("  term: " + termAttr);
    }

    // Finish the stream and release its resources
    stream.end();
    stream.close();
}

From source file:de.berlinbuzzwords.AnalyzerPrinter.java

License:Apache License

public void printTokenDetails(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));

    // Reset stream before token consumption
    stream.reset();

    // Attribute to get the token term text, offset and type
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    // Output source text
    System.out.println("text: " + text);

    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text, type and offset
        System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type() + "\tstart offset: "
                + offsetAttr.startOffset() + "\tend offset: " + offsetAttr.endOffset());
    }

    stream.end();
    stream.close();
}

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // Process words from last to first so that a replacement of different
    // length does not shift the offsets of words not yet processed
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // Process words from last to first so that a replacement of different
    // length does not shift the offsets of words not yet processed
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.catma.indexer.TermExtractor.java

License:Open Source License

private void extractTermInfos(String content, List<String> unseparableCharacterSequences,
        List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException {

    terms = new HashMap<String, List<TermInfo>>();
    termsInOrder = new ArrayList<String>();

    if (locale == null) {
        locale = Locale.getDefault();
    }

    WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer(
            unseparableCharacterSequences, userDefinedSeparatingCharacters, locale);

    // our analyzer does not use the field name
    TokenStream ts = analyzer.tokenStream(null, new StringReader(content));

    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);

    ts.reset(); // the stream must be reset before consumption

    int positionCounter = 0;
    while (ts.incrementToken()) {

        TermInfo ti = new TermInfo(termAttr.toString(), offsetAttr.startOffset(), offsetAttr.endOffset(),
                positionCounter);

        if (!terms.containsKey(ti.getTerm())) {
            terms.put(ti.getTerm(), new ArrayList<TermInfo>());
        }
        terms.get(ti.getTerm()).add(ti);
        positionCounter++;

        termsInOrder.add(ti.getTerm());
    }
}

From source file:de.catma.indexer.WildcardTermExtractor.java

License:Open Source License

private void extractTermInfosWithWildcards(String content, List<String> unseparableCharacterSequences,
        List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException {

    if (locale == null) {
        locale = Locale.getDefault();
    }

    WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer(
            unseparableCharacterSequences, userDefinedSeparatingCharacters, locale);

    // our analyzer does not use the field name
    TokenStream ts = analyzer.tokenStream(null, new StringReader(content));

    CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);

    ts.reset(); // the stream must be reset before consumption

    WildcardParser wildcardParser = new WildcardParser();
    while (ts.incrementToken()) {
        wildcardParser.handle(termAttr, offsetAttr);
    }
    wildcardParser.finish();

    orderedTerms = new ArrayList<String>();
    for (TermInfo ti : wildcardParser.getOrderedTermInfos()) {
        orderedTerms.add(ti.getTerm());
    }

}

From source file:de.csw.ontology.XWikiTextEnhancer.java

License:Open Source License

/**
 * The enhanced text contains links to the Lucene search page of the XWiki
 * system. The search terms are related to the annotated phrase.
 */
public String enhance(String text) {
    CSWGermanAnalyzer ga = new CSWGermanAnalyzer();
    TokenStream ts = null;
    StringBuilder result = new StringBuilder();

    initializeLinkIndex(text);

    try {
        Reader r = new BufferedReader(new StringReader(text));

        ts = ga.tokenStream("", r);

        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttribute = ts.addAttribute(TypeAttribute.class);

        String term;
        int lastEndIndex = 0;

        ts.reset(); // the stream must be reset before consumption

        while (ts.incrementToken()) {

            result.append(text.substring(lastEndIndex, offsetAttribute.startOffset()));
            term = String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length());

            if (typeAttribute.type().equals(ConceptFilter.CONCEPT_TYPE) && isAnnotatable(offsetAttribute)) {
                log.debug("Annotating concept: " + term);
                annotateWithSearch(result,
                        text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()), term);
            } else {
                result.append(text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()));
            }

            lastEndIndex = offsetAttribute.endOffset();
        }
        result.append(text.subSequence(lastEndIndex, text.length()));
    } catch (IOException e) {
        log.error("Error while processing the page content", e);
    }

    ga.close();
    return result.toString();
}

From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java

License:Open Source License

/**
 * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer,
 * depending on whether the text is Japanese or not.
 *
 * @param fieldName
 *            name of the Lucene index field.
 * @param reader
 *            reader with the field value
 *
 * @return TokenStream tokenStream
 *
 * @sb
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    if (log.isDebugEnabled()) {
        log.debug("tokenizing with EscidocJapaneseAnalyzer");
    }
    //checkJapanese ///////////////////////////////////////////////////////
    boolean isJapanese = false;
    TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader);
    Reader reader1 = null;
    try {
        StringBuffer tokenBuffer = new StringBuffer("");
        CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class);
        whitespaceTokens.reset();
        while (whitespaceTokens.incrementToken()) {
            if (tokenBuffer.length() > 0) {
                tokenBuffer.append(" ");
            }
            tokenBuffer.append(termAtt.toString());
        }
        for (int i = 0; i < tokenBuffer.length(); i++) {
            int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16);
            // code points U+3000 to U+340F: CJK punctuation, Hiragana, Katakana
            // and neighbouring CJK blocks
            if (hexInt > 12287 && hexInt < 13328) {
                isJapanese = true;
                break;
            }
        }
        reader1 = new StringReader(tokenBuffer.toString());
    } catch (Exception e) {
        log.error(e);
    }
    ///////////////////////////////////////////////////////////////////////

    //No Japanese, so return whitespace-tokens
    if (!isJapanese) {
        TokenStream result = new XmlWhitespaceTokenizer(reader1);
        result = new JunkFilter(result);
        result = new LowerCaseFilter(Constants.LUCENE_VERSION, result);
        return result;
    }

    //Get Japanese Tokens
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION);
    TokenStream japaneseTokens = analyzer.tokenStream("", reader1);
    try {
        analyzer.close();
    } catch (Exception e) {
        // ignore failures while closing the analyzer
    }
    return japaneseTokens;
}

From source file:de.escidoc.sb.common.lucene.analyzer.Test.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) {
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader("ome text goes here"));
        ts = new GreekFilter(ts);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        System.out.println(e);
    }
}