Example usage for org.apache.lucene.analysis TokenStream addAttribute

Introduction

On this page you can find example usages of org.apache.lucene.analysis.TokenStream.addAttribute, collected from open-source projects.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Document

The caller must pass in a Class&lt;? extends Attribute&gt; value.
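
For orientation, here is a minimal, self-contained consumer sketch (not taken from any of the projects below) showing the standard workflow around addAttribute. It assumes a recent Lucene release: the String overload of Analyzer.tokenStream requires Lucene 4.1+, and the no-argument StandardAnalyzer constructor requires Lucene 5+. The field name and sample text are illustrative.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AddAttributeDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream(String, String) builds a stream over the given text
        try (TokenStream stream = analyzer.tokenStream("body", "some sample text")) {
            // Register (or retrieve) the term attribute before consuming the stream
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset(); // mandatory before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println("term: " + termAtt.toString());
            }
            stream.end(); // records end-of-stream state such as the final offset
        } // try-with-resources closes the stream so the analyzer can be reused
        analyzer.close();
    }
}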

Usage

From source file:de.berlinbuzzwords.AnalyzerPrinter.java

License:Apache License

public void printTerms(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummyField", new StringReader(text));

    // Reset stream before token consumption
    stream.reset();

    // Attribute to get the term text for a token
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);

    // Output source text
    System.out.println("\ntext: " + text);

    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text
        System.out.println("  term: " + termAttr);
    }

    // Signal end-of-stream and release resources so the analyzer can be reused
    stream.end();
    stream.close();
}

From source file:de.berlinbuzzwords.AnalyzerPrinter.java

License:Apache License

public void printTokenDetails(Analyzer analyzer, String text) throws IOException {
    // Create token stream from reader
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));

    // Reset stream before token consumption
    stream.reset();

    // Attribute to get the token term text, offset and type
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAttr = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class);

    // Output source text
    System.out.println("text: " + text);

    // Analyze text and iterate until end of input
    while (stream.incrementToken()) {
        // Output term text, type and offset
        System.out.println("term: " + termAttr + "\ttype: " + typeAttr.type() + "\tstart offset: "
                + offsetAttr.startOffset() + "\tend offset: " + offsetAttr.endOffset());
    }

    // Signal end-of-stream and release resources so the analyzer can be reused
    stream.end();
    stream.close();
}

From source file:de.blizzy.documentr.search.PageFinder.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(PageIndex.ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Util.closeQuietly(tokenStream);
    }

    // Work from the end of the text backwards so that earlier
    // replacements do not invalidate the stored word offsets
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(PageIndex.ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // Work from the end of the text backwards so that earlier
    // replacements do not invalidate the stored word offsets
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.csw.ontology.XWikiTextEnhancer.java

License:Open Source License

/**
 * The enhanced text contains links to the Lucene search page of the xWiki
 * system. The search terms are related to the annotated phrase.
 */
public String enhance(String text) {
    CSWGermanAnalyzer ga = new CSWGermanAnalyzer();
    TokenStream ts = null;
    StringBuilder result = new StringBuilder();

    initializeLinkIndex(text);

    try {
        Reader r = new BufferedReader(new StringReader(text));

        ts = ga.tokenStream("", r);

        CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
        TypeAttribute typeAttribute = ts.addAttribute(TypeAttribute.class);

        String term;
        int lastEndIndex = 0;

        // Modern Lucene requires reset() before the first incrementToken()
        ts.reset();

        while (ts.incrementToken()) {

            result.append(text.substring(lastEndIndex, offsetAttribute.startOffset()));
            term = String.copyValueOf(charTermAttribute.buffer(), 0, charTermAttribute.length());

            if (typeAttribute.type().equals(ConceptFilter.CONCEPT_TYPE) && isAnnotatable(offsetAttribute)) {
                log.debug("Annotating concept: " + term);
                annotateWithSearch(result,
                        text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()), term);
            } else {
                result.append(text.substring(offsetAttribute.startOffset(), offsetAttribute.endOffset()));
            }

            lastEndIndex = offsetAttribute.endOffset();
        }
        result.append(text.subSequence(lastEndIndex, text.length()));
    } catch (IOException e) {
        log.error("Error while processing the page content", e);
    }

    ga.close();
    return result.toString();
}

From source file:de.escidoc.sb.common.lucene.analyzer.EscidocJapaneseAnalyzer.java

License:Open Source License

/**
 * Constructs a token stream with JapaneseAnalyzer or WhitespaceTokenizer,
 * depending on whether the text is Japanese or not.
 *
 * @param fieldName
 *            name of the Lucene Indexfield.
 * @param reader
 *            reader with field-value
 * 
 * @return TokenStream tokenStream
 * 
 * @sb
 */
@Override
public TokenStream tokenStream(final String fieldName, final Reader reader) {
    if (log.isDebugEnabled()) {
        log.debug("tokenizing with EscidocJapaneseAnalyzer");
    }
    //checkJapanese ///////////////////////////////////////////////////////
    boolean isJapanese = false;
    TokenStream whitespaceTokens = new WhitespaceTokenizer(Constants.LUCENE_VERSION, reader);
    Reader reader1 = null;
    try {
        StringBuffer tokenBuffer = new StringBuffer("");
        CharTermAttribute termAtt = whitespaceTokens.addAttribute(CharTermAttribute.class);
        whitespaceTokens.reset();
        while (whitespaceTokens.incrementToken()) {
            if (tokenBuffer.length() > 0) {
                tokenBuffer.append(" ");
            }
            tokenBuffer.append(termAtt.toString());
        }
        whitespaceTokens.end();
        whitespaceTokens.close();
        // Treat the text as Japanese if it contains a code point in the range
        // 0x3000-0x340F (CJK punctuation, Hiragana, Katakana and neighboring
        // blocks); 12287 and 13328 are the exclusive decimal bounds
        for (int i = 0; i < tokenBuffer.length(); i++) {
            int hexInt = Integer.parseInt(charToHex(tokenBuffer.charAt(i)), 16);
            if (hexInt > 12287 && hexInt < 13328) {
                isJapanese = true;
                break;
            }
        }
        reader1 = new StringReader(tokenBuffer.toString());
    } catch (Exception e) {
        log.error(e);
    }
    ///////////////////////////////////////////////////////////////////////

    //No Japanese, so return whitespace-tokens
    if (!isJapanese) {
        TokenStream result = new XmlWhitespaceTokenizer(reader1);
        result = new JunkFilter(result);
        result = new LowerCaseFilter(Constants.LUCENE_VERSION, result);
        return result;
    }

    //Get Japanese Tokens
    JapaneseAnalyzer analyzer = new JapaneseAnalyzer(Constants.LUCENE_VERSION);
    TokenStream japaneseTokens = analyzer.tokenStream("", reader1);
    try {
        analyzer.close();
    } catch (Exception e) {
        // ignore failures while closing the analyzer
    }
    return japaneseTokens;
}

From source file:de.escidoc.sb.common.lucene.analyzer.Test.java

License:Open Source License

/**
 * @param args
 */
public static void main(String[] args) {
    try {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader("some text goes here"));
        ts = new GreekFilter(ts);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + termAtt.toString());
        }
        ts.end();
        ts.close();
    } catch (Exception e) {
        System.out.println(e);
    }
}

From source file:de.ingrid.interfaces.csw.tools.LuceneTools.java

License:EUPL

/**
 * @param term
 * @return filtered term
 * @throws IOException
 */
public String filterTerm(String term) throws IOException {
    String result = "";

    // Always use the same analyzer, NOT a new instance! This is called in the mapping process.
    Analyzer myAnalyzer = getAnalyzer();
    TokenStream ts = myAnalyzer.tokenStream(null, new StringReader(term));
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);

    // Modern Lucene requires reset() before the first incrementToken()
    ts.reset();
    while (ts.incrementToken()) {
        String t = charTermAttribute.toString();
        result = result + " " + t;
    }
    ts.end();
    ts.close();
    return result.trim();
}

From source file:di.uniba.it.tri.occ.BuildOccurrence.java

License:Open Source License

private List<String> getTokens(Reader reader) throws IOException {
    List<String> tokens = new ArrayList<>();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    TokenStream tokenStream = analyzer.tokenStream("text", reader);
    tokenStream.reset();
    CharTermAttribute cattr = tokenStream.addAttribute(CharTermAttribute.class);
    while (tokenStream.incrementToken()) {
        String token = cattr.toString();
        String[] split = token.split("'");
        if (split.length == 1) {
            tokens.add(token);
        } else {
            int max = 0;
            int index = 0;
            for (int i = 0; i < split.length; i++) {
                if (split[i].length() > max) {
                    max = split[i].length();
                    index = i;
                }
            }
            tokens.add(split[index]);
        }
    }
    tokenStream.end();
    tokenStream.close();
    return tokens;
}

From source file:doc2vec.LuceneDocIterator.java

String preProcess(Analyzer analyzer, String text) throws Exception {

    StringBuffer tokenizedContentBuff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(text));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();

    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();

        if (labelsStoredWithWords) {
            term = term.split("\\" + AMIIndexer.WORD_LABEL_DELIM)[0]; // the first part is the word
        }

        if (!term.trim().equals(""))
            tokenizedContentBuff.append(term).append(" ");
    }

    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}