List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; object specifying which attribute instance to retrieve from the stream.
From source file:de.blizzy.documentr.search.PageIndex.java
License:Open Source License
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication, IndexSearcher searcher) throws IOException, ParseException, TimeoutException { List<WordPosition> words = Lists.newArrayList(); TokenStream tokenStream = null; try {//from w w w . j a v a2 s .c o m tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText)); tokenStream.addAttribute(CharTermAttribute.class); tokenStream.addAttribute(OffsetAttribute.class); tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class); String text = charTerm.toString(); if (StringUtils.isNotBlank(text)) { OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class); WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset()); words.add(word); } } tokenStream.end(); } finally { Closeables.closeQuietly(tokenStream); } Collections.reverse(words); StringBuilder suggestedSearchText = new StringBuilder(searchText); StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText); boolean foundSuggestions = false; String now = String.valueOf(System.currentTimeMillis()); String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$ DirectSpellChecker spellChecker = new DirectSpellChecker(); IndexReader reader = searcher.getIndexReader(); for (WordPosition word : words) { Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord()); SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader, SuggestMode.SUGGEST_MORE_POPULAR); if (suggestions.length > 0) { String suggestedWord = suggestions[0].string; int start = word.getStart(); int end = word.getEnd(); suggestedSearchText.replace(start, end, suggestedWord); suggestedSearchTextHtml.replace(start, end, startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker); foundSuggestions = 
true; } } if (foundSuggestions) { String suggestion = suggestedSearchText.toString(); SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher); int suggestionTotalHits = suggestionResult.getTotalHits(); if (suggestionTotalHits > 0) { String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString()) .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$ return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits); } } return null; }
From source file:de.catma.indexer.PunctuationTokenizer.java
License:Open Source License
/**
 * Creates a tokenizer that further splits the wrapped stream on punctuation.
 *
 * @param input the upstream token stream to wrap
 * @param unseparableCharacterSequences character sequences that must never be
 *        split; may be {@code null}, in which case an empty tree is used
 * @param userDefSeparatingPunctuationPattern pattern of OR-ed user-defined
 *        separating punctuation characters
 * @param locale the locale of the main language of the content
 */
public PunctuationTokenizer(TokenStream input, CharTree unseparableCharacterSequences,
        Pattern userDefSeparatingPunctuationPattern, Locale locale) {
    super(input);
    termInfoBuffer = new ArrayDeque<TermInfo>();
    // getAttribute is generic, so no explicit casts are required.
    offsetAtt = input.getAttribute(OffsetAttribute.class);
    termAtt = input.getAttribute(CharTermAttribute.class);
    ucAtt = input.getAttribute(UnseparableCharacterSequenceAttribute.class);
    treeRoot = (unseparableCharacterSequences != null) ? unseparableCharacterSequences : CharTree.EMPTY_TREE;
    this.userDefSeparatingPunctuationPattern = userDefSeparatingPunctuationPattern;
    this.locale = locale;
}
From source file:de.catma.indexer.TermExtractor.java
License:Open Source License
private void extractTermInfos(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { terms = new HashMap<String, List<TermInfo>>(); termsInOrder = new ArrayList<String>(); if (locale == null) { locale = Locale.getDefault(); }//from w w w . j a va 2 s . c om WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); int positionCounter = 0; while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); TermInfo ti = new TermInfo(termAttr.toString(), offsetAttr.startOffset(), offsetAttr.endOffset(), positionCounter); if (!terms.containsKey(ti.getTerm())) { terms.put(ti.getTerm(), new ArrayList<TermInfo>()); } terms.get(ti.getTerm()).add(ti); positionCounter++; termsInOrder.add(ti.getTerm()); } }
From source file:de.catma.indexer.WildcardTermExtractor.java
License:Open Source License
private void extractTermInfosWithWildcards(String content, List<String> unseparableCharacterSequences, List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException { if (locale == null) { locale = Locale.getDefault(); }/* w w w . j a v a 2 s. com*/ WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer( unseparableCharacterSequences, userDefinedSeparatingCharacters, locale); TokenStream ts = analyzer.tokenStream(null, // our analyzer does not use // the fieldname new StringReader(content)); WildcardParser wildcardParser = new WildcardParser(); while (ts.incrementToken()) { CharTermAttribute termAttr = (CharTermAttribute) ts.getAttribute(CharTermAttribute.class); OffsetAttribute offsetAttr = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); wildcardParser.handle(termAttr, offsetAttr); } wildcardParser.finish(); orderedTerms = new ArrayList<String>(); for (TermInfo ti : wildcardParser.getOrderedTermInfos()) { orderedTerms.add(ti.getTerm()); } }
From source file:de.jetwick.es.JetwickQuery.java
License:Apache License
/**
 * Runs the given token stream through an English Snowball stemmer and returns
 * the distinct stemmed terms in first-seen order.
 *
 * @param ts the token stream to stem; it is consumed and closed by this method
 * @return the stemmed terms (insertion-ordered, duplicates removed); possibly
 *         incomplete if an I/O error occurred (the error is logged, not thrown)
 */
public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    try {
        while (ts.incrementToken()) {
            res.add(ts.getAttribute(TermAttribute.class).term());
        }
        ts.end(); // BUGFIX: signal end-of-stream per the TokenStream contract
    } catch (IOException ex) {
        logger.error("Exception while stemming to snoball", ex);
    } finally {
        try {
            ts.close(); // BUGFIX: stream was previously never closed (resource leak)
        } catch (IOException ex) {
            logger.error("Exception while stemming to snoball", ex);
        }
    }
    return res;
}
From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java
License:Open Source License
/**
 * Runs the given string through the analyzer for the given field and returns
 * the first token it produces, or {@code null} if there is no token.
 *
 * @param aFieldName field name passed to the analyzer
 * @param aString    text to analyze
 * @throws IOException if tokenization fails
 */
private String analyze(String aFieldName, String aString) throws IOException {
    TokenStream stream = analyzer.tokenStream(aFieldName, aString);
    stream.reset();
    CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
    try {
        return stream.incrementToken() ? term.toString() : null;
    } finally {
        stream.end();
        stream.close();
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
/**
 * Tokenizes {@code aTerm} and adds one MUST clause per token to {@code aQuery}:
 * a {@link WildcardQuery} built from the raw input when it contains wildcard
 * characters, otherwise a {@link TermQuery} for the token text.
 *
 * @param aTerm     the raw query term entered by the user
 * @param aQuery    the boolean query to add clauses to
 * @param aField    the index field to query
 * @param aAnalyzer analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization fails
 */
protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {
    // Whether the raw input is a wildcard expression cannot change per token,
    // so compute it once instead of once per loop iteration.
    boolean isWildcard = isWildcardTerm(aTerm);
    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // Attribute instance is stable for the stream's lifetime — look it up once.
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (isWildcard) {
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.term()));
            }
            aQuery.add(theTempQuery, Occur.MUST);
        }
    } finally {
        theTokenStream.close(); // BUGFIX: the stream was previously never closed
    }
}
From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java
License:Open Source License
/**
 * Tokenizes {@code aTerm} and adds it to {@code aQuery} as a phrase query.
 * Plain tokens become single phrase positions; tokens containing wildcards are
 * expanded against the index via {@link WildcardTermEnum}, and all matching
 * terms are added as alternatives at that phrase position.
 *
 * @param aTerm     the raw phrase entered by the user
 * @param aQuery    the boolean query to add the phrase clause to
 * @param aField    the index field to query
 * @param aAnalyzer analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization or term enumeration fails
 */
protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {
    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();
    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // Attribute instance is stable for the stream's lifetime — look it up once.
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
        while (theTokenStream.incrementToken()) {
            String theTokenText = theTermAttribute.term();
            Term theTerm = new Term(aField, theTokenText);
            if (!isWildcardTerm(theTokenText)) {
                thePhraseQuery.add(theTerm);
            } else {
                Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
                WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
                try {
                    List<Term> theTerms = new ArrayList<Term>();
                    // BUGFIX: the original do/while added theEnum.term() before
                    // checking it, inserting null into the phrase query when the
                    // wildcard matched nothing in the index.
                    do {
                        Term theMatch = theEnum.term();
                        if (theMatch == null) {
                            break;
                        }
                        theTerms.add(theMatch);
                    } while (theEnum.next());
                    // MultiPhraseQuery.add rejects empty arrays; skip positions
                    // with no index matches instead of failing.
                    if (!theTerms.isEmpty()) {
                        thePhraseQuery.add(theTerms.toArray(new Term[0]));
                    }
                } finally {
                    theEnum.close();
                }
            }
        }
    } finally {
        theTokenStream.close(); // BUGFIX: the stream was previously never closed
    }
    aQuery.add(thePhraseQuery, Occur.MUST);
}
From source file:de.twitterlivesearch.analysis.Tokenizer.java
License:Apache License
/**
 * Tokenizes a string with the given analyzer.
 *
 * @param stringToAnalyze the string to be tokenized
 * @param analyzer the {@link org.apache.lucene.analysis.Analyzer Analyzer} to
 *        be used for analysis
 * @return list of tokens in stream order
 * @throws RuntimeException wrapping any {@link IOException} from the analyzer
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze));
        try {
            stream.reset();
            // Attribute instance is stable for the stream's lifetime — look it up once.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                tokens.add(term.toString());
            }
            stream.end();
        } finally {
            // BUGFIX: close in finally — previously the stream leaked whenever
            // incrementToken() threw.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}
From source file:de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java
License:Open Source License
/** * Split the information cause in sense of term it is a standalone word. * TODO this method removes stopwords but don't detect any phrases. * /*from w w w.j a va 2 s . co m*/ * @param result * the list we will append the items * @param item * the item itself. */ private void add(ExtractionResultImpl result, String item) { if (item != null) { inferLanguage(item); List<String> terms = new ArrayList<String>(); TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item)); // OffsetAttribute offsetAttribute = // tokenStream.getAttribute(OffsetAttribute.class); TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class); try { while (tokenStream.incrementToken()) { // int startOffset = offsetAttribute.startOffset(); // int endOffset = offsetAttribute.endOffset(); String term = termAttribute.term(); terms.add(term); } } catch (IOException e) { logger.error(e.getMessage(), e); } terms = filter.filter(terms, locale); for (String t : terms) { if (!StringUtils.isEmpty((t))) { Entry e = new EntryImpl(t.toLowerCase(locale)); result.add(e); } } } }