Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class&lt;? extends Attribute&gt; value identifying the attribute to retrieve.

Usage

From source file:de.blizzy.documentr.search.PageIndex.java

License:Open Source License

/**
 * Builds a "did you mean?"-style suggestion for a search query.
 * <p>
 * The query text is tokenized, each word is spell-checked against the
 * {@code ALL_TEXT_SUGGESTIONS} field, and misspelled words are replaced by
 * their top suggestion. A suggestion is only returned if the corrected query
 * actually produces hits.
 *
 * @param searchText the raw query text entered by the user
 * @param authentication the current user's authentication (used to re-run the search)
 * @param searcher the searcher over the page index
 * @return a suggestion with plain text, highlighted HTML, and the corrected
 *         query's hit count, or {@code null} if no better query was found
 */
private SearchTextSuggestion getSearchTextSuggestion(String searchText, Authentication authentication,
        IndexSearcher searcher) throws IOException, ParseException, TimeoutException {

    List<WordPosition> words = Lists.newArrayList();

    TokenStream tokenStream = null;
    try {
        tokenStream = analyzer.tokenStream(ALL_TEXT_SUGGESTIONS, new StringReader(searchText));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.addAttribute(OffsetAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            CharTermAttribute charTerm = tokenStream.getAttribute(CharTermAttribute.class);
            String text = charTerm.toString();
            if (StringUtils.isNotBlank(text)) {
                // record each word together with its character offsets so it can
                // be replaced in-place later
                OffsetAttribute offset = tokenStream.getAttribute(OffsetAttribute.class);
                WordPosition word = new WordPosition(text, offset.startOffset(), offset.endOffset());
                words.add(word);
            }
        }
        tokenStream.end();
    } finally {
        Closeables.closeQuietly(tokenStream);
    }

    // replace from the end of the string towards the start so that earlier
    // words' offsets are not shifted by replacements of differing length
    Collections.reverse(words);

    StringBuilder suggestedSearchText = new StringBuilder(searchText);
    StringBuilder suggestedSearchTextHtml = new StringBuilder(searchText);
    boolean foundSuggestions = false;
    // timestamp makes the markers unlikely to collide with user-entered text
    String now = String.valueOf(System.currentTimeMillis());
    String startMarker = "__SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    String endMarker = "__/SUGGESTION-" + now + "__"; //$NON-NLS-1$ //$NON-NLS-2$
    DirectSpellChecker spellChecker = new DirectSpellChecker();
    IndexReader reader = searcher.getIndexReader();
    for (WordPosition word : words) {
        Term term = new Term(ALL_TEXT_SUGGESTIONS, word.getWord());
        SuggestWord[] suggestions = spellChecker.suggestSimilar(term, 1, reader,
                SuggestMode.SUGGEST_MORE_POPULAR);
        if (suggestions.length > 0) {
            String suggestedWord = suggestions[0].string;
            int start = word.getStart();
            int end = word.getEnd();
            suggestedSearchText.replace(start, end, suggestedWord);
            suggestedSearchTextHtml.replace(start, end,
                    startMarker + StringEscapeUtils.escapeHtml4(suggestedWord) + endMarker);

            foundSuggestions = true;
        }
    }

    if (foundSuggestions) {
        String suggestion = suggestedSearchText.toString();
        // only suggest queries that actually return results
        SearchResult suggestionResult = findPages(suggestion, 1, authentication, searcher);
        int suggestionTotalHits = suggestionResult.getTotalHits();
        if (suggestionTotalHits > 0) {
            // NOTE(review): suggested words were already HTML-escaped when inserted
            // above, and escapeHtml4 here escapes the whole string again — suggested
            // words containing &, <, > may end up double-escaped; confirm intended.
            String html = StringEscapeUtils.escapeHtml4(suggestedSearchTextHtml.toString())
                    .replaceAll(startMarker + "(.*?)" + endMarker, "<strong><em>$1</em></strong>"); //$NON-NLS-1$ //$NON-NLS-2$
            return new SearchTextSuggestion(suggestedSearchText.toString(), html, suggestionTotalHits);
        }
    }

    return null;
}

From source file:de.catma.indexer.PunctuationTokenizer.java

License:Open Source License

/**
 * Constructor.//from  w w w .  j a  v  a2  s  . c  o  m
 *
 * @param input the input stream
 * @param unseparableCharacterSequences the list of unseparable character sequences
 * @param userDefSeparatingPunctuationPattern a pattern of OR-ed user defined
 *  separating punctuation characters
 * @param locale the locale of the main language of the content
 */
/**
 * Constructor.
 *
 * @param input the input stream
 * @param unseparableCharacterSequences the tree of unseparable character sequences,
 *        or {@code null} for none
 * @param userDefSeparatingPunctuationPattern a pattern of OR-ed user defined
 *        separating punctuation characters
 * @param locale the locale of the main language of the content
 */
public PunctuationTokenizer(TokenStream input, CharTree unseparableCharacterSequences,
        Pattern userDefSeparatingPunctuationPattern, Locale locale) {
    super(input);
    termInfoBuffer = new ArrayDeque<TermInfo>();
    // getAttribute is generic and returns the requested type, so the
    // explicit casts of the original code were redundant.
    offsetAtt = input.getAttribute(OffsetAttribute.class);
    termAtt = input.getAttribute(CharTermAttribute.class);
    ucAtt = input.getAttribute(UnseparableCharacterSequenceAttribute.class);

    // fall back to an empty tree so downstream code never sees null
    treeRoot = (unseparableCharacterSequences != null) ? unseparableCharacterSequences : CharTree.EMPTY_TREE;
    this.userDefSeparatingPunctuationPattern = userDefSeparatingPunctuationPattern;
    this.locale = locale;
}

From source file:de.catma.indexer.TermExtractor.java

License:Open Source License

/**
 * Tokenizes the given content and fills the {@code terms} map (term -&gt; occurrences)
 * and the {@code termsInOrder} list (terms in document order).
 *
 * @param content the text to tokenize
 * @param unseparableCharacterSequences sequences that must not be split
 * @param userDefinedSeparatingCharacters additional separator characters
 * @param locale locale of the content's main language; defaults to
 *        {@link Locale#getDefault()} when {@code null}
 * @throws IOException if tokenization fails
 */
private void extractTermInfos(String content, List<String> unseparableCharacterSequences,
        List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException {

    terms = new HashMap<String, List<TermInfo>>();
    termsInOrder = new ArrayList<String>();

    if (locale == null) {
        locale = Locale.getDefault();
    }

    WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer(
            unseparableCharacterSequences, userDefinedSeparatingCharacters, locale);

    // field name is null because our analyzer does not use it
    TokenStream ts = analyzer.tokenStream(null, new StringReader(content));
    try {
        // attribute instances are fixed per stream — fetch them once
        // instead of on every iteration as the original did
        CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);

        int positionCounter = 0;
        while (ts.incrementToken()) {
            TermInfo ti = new TermInfo(termAttr.toString(), offsetAttr.startOffset(), offsetAttr.endOffset(),
                    positionCounter);

            // single lookup instead of containsKey + get
            List<TermInfo> occurrences = terms.get(ti.getTerm());
            if (occurrences == null) {
                occurrences = new ArrayList<TermInfo>();
                terms.put(ti.getTerm(), occurrences);
            }
            occurrences.add(ti);
            positionCounter++;

            termsInOrder.add(ti.getTerm());
        }
    } finally {
        // the original never closed the stream, leaking its resources
        ts.close();
    }
}

From source file:de.catma.indexer.WildcardTermExtractor.java

License:Open Source License

/**
 * Tokenizes the given content, feeds every token through the wildcard parser,
 * and stores the resulting terms (in document order) in {@code orderedTerms}.
 *
 * @param content the text to tokenize
 * @param unseparableCharacterSequences sequences that must not be split
 * @param userDefinedSeparatingCharacters additional separator characters
 * @param locale locale of the content's main language; defaults to
 *        {@link Locale#getDefault()} when {@code null}
 * @throws IOException if tokenization fails
 */
private void extractTermInfosWithWildcards(String content, List<String> unseparableCharacterSequences,
        List<Character> userDefinedSeparatingCharacters, Locale locale) throws IOException {

    if (locale == null) {
        locale = Locale.getDefault();
    }

    WhitespaceAndPunctuationAnalyzer analyzer = new WhitespaceAndPunctuationAnalyzer(
            unseparableCharacterSequences, userDefinedSeparatingCharacters, locale);

    // field name is null because our analyzer does not use it
    TokenStream ts = analyzer.tokenStream(null, new StringReader(content));

    WildcardParser wildcardParser = new WildcardParser();
    try {
        // attribute instances are fixed per stream — fetch them once
        // instead of on every iteration as the original did
        CharTermAttribute termAttr = ts.getAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAttr = ts.getAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
            wildcardParser.handle(termAttr, offsetAttr);
        }
    } finally {
        // the original never closed the stream, leaking its resources
        ts.close();
    }
    wildcardParser.finish();

    orderedTerms = new ArrayList<String>();
    for (TermInfo ti : wildcardParser.getOrderedTermInfos()) {
        orderedTerms.add(ti.getTerm());
    }
}

From source file:de.jetwick.es.JetwickQuery.java

License:Apache License

/**
 * Runs the given token stream through an English Snowball stemmer and collects
 * the distinct stemmed terms in encounter order.
 *
 * @param ts the token stream to stem; it is wrapped, not closed, by this method
 * @return the set of stemmed terms, in first-seen order
 */
public Set<String> doSnowballStemming(TokenStream ts) {
    Set<String> res = new LinkedHashSet<String>();
    ts = new SnowballFilter(ts, "English");
    // the filter registers its term attribute at construction time, so fetch
    // it once here rather than on every token as the original did
    TermAttribute termAtt = ts.getAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            res.add(termAtt.term());
        }
    } catch (IOException ex) {
        // fixed typo in the original log message ("snoball")
        logger.error("Exception while stemming to snowball", ex);
    }

    return res;
}

From source file:de.mirkosertic.desktopsearch.SearchPhraseSuggester.java

License:Open Source License

/**
 * Runs the given string through the analyzer for the given field and returns
 * the first token it produces.
 *
 * @param aFieldName the field whose analysis chain should be used
 * @param aString the text to analyze
 * @return the first token, or {@code null} if the analyzer produced none
 * @throws IOException if tokenization fails
 */
private String analyze(String aFieldName, String aString) throws IOException {
    // try-with-resources fixes the original's leak: reset() was called before
    // the try block, so a failing reset() left the stream unclosed
    try (TokenStream theTokenStream = analyzer.tokenStream(aFieldName, aString)) {
        CharTermAttribute theCharTerms = theTokenStream.getAttribute(CharTermAttribute.class);
        theTokenStream.reset();
        try {
            if (theTokenStream.incrementToken()) {
                return theCharTerms.toString();
            }
            return null;
        } finally {
            // end() must run before close(), as in the original
            theTokenStream.end();
        }
    }
}

From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License:Open Source License

/**
 * Tokenizes {@code aTerm} with the given analyzer and adds, for each token, either
 * a {@link WildcardQuery} (when the input term contains wildcards) or a
 * {@link TermQuery} to {@code aQuery} as a MUST clause.
 *
 * @param aTerm the raw search term
 * @param aQuery the boolean query to add clauses to
 * @param aField the field the clauses target
 * @param aAnalyzer the analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization fails
 */
protected void addWildcardOrTermQueries(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // attribute instance is fixed per stream — fetch it once instead of
        // on every iteration as the original did
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);

        // NOTE(review): the wildcard check inspects the whole input aTerm, not the
        // current token, so a multi-token wildcard input adds the identical
        // WildcardQuery once per token — confirm this is intended.
        while (theTokenStream.incrementToken()) {
            Query theTempQuery;
            if (isWildcardTerm(aTerm)) {
                theTempQuery = new WildcardQuery(new Term(aField, getCorrectedWildcardTerm(aTerm)));
            } else {
                theTempQuery = new TermQuery(new Term(aField, theTermAttribute.term()));
            }
            aQuery.add(theTempQuery, Occur.MUST);
        }
    } finally {
        // the original never closed the stream, leaking its resources
        theTokenStream.close();
    }
}

From source file:de.powerstaff.business.service.impl.GoogleStyleQueryParser.java

License:Open Source License

/**
 * Tokenizes {@code aTerm} and builds a {@link MultiPhraseQuery} from the tokens,
 * expanding wildcard tokens to all matching index terms, then adds the phrase
 * query to {@code aQuery} as a MUST clause.
 *
 * @param aTerm the raw phrase text
 * @param aQuery the boolean query to add the phrase clause to
 * @param aField the field the phrase targets
 * @param aAnalyzer the analyzer used to tokenize {@code aTerm}
 * @throws IOException if tokenization or term enumeration fails
 */
protected void addPhraseQuery(String aTerm, BooleanQuery aQuery, String aField, Analyzer aAnalyzer)
        throws IOException {

    MultiPhraseQuery thePhraseQuery = new MultiPhraseQuery();

    TokenStream theTokenStream = aAnalyzer.tokenStream(aField, new StringReader(aTerm));
    try {
        // attribute instance is fixed per stream — fetch it once instead of
        // on every iteration as the original did
        TermAttribute theTermAttribute = theTokenStream.getAttribute(TermAttribute.class);
        while (theTokenStream.incrementToken()) {

            String theTokenText = theTermAttribute.term();

            Term theTerm = new Term(aField, theTokenText);

            if (!isWildcardTerm(theTokenText)) {
                thePhraseQuery.add(theTerm);
            } else {
                Term theWildcardTerm = new Term(theTerm.field(), getCorrectedWildcardTerm(theTerm.text()));
                WildcardTermEnum theEnum = new WildcardTermEnum(reader, theWildcardTerm);
                try {
                    List<Term> theTerms = new ArrayList<Term>();
                    // BUG FIX: the original do/while added theEnum.term()
                    // unconditionally, inserting null into the list when the
                    // enum matched no index terms at all
                    do {
                        Term theMatchedTerm = theEnum.term();
                        if (theMatchedTerm != null) {
                            theTerms.add(theMatchedTerm);
                        }
                    } while (theEnum.next());
                    // only add a position when at least one term matched
                    if (!theTerms.isEmpty()) {
                        thePhraseQuery.add(theTerms.toArray(new Term[0]));
                    }
                } finally {
                    theEnum.close();
                }
            }
        }
    } finally {
        // the original never closed the stream, leaking its resources
        theTokenStream.close();
    }

    aQuery.add(thePhraseQuery, Occur.MUST);
}

From source file:de.twitterlivesearch.analysis.Tokenizer.java

License:Apache License

/**
 * @param stringToAnalyze//from  w w w .j  av  a 2  s.co m
 *            String to be tokenized
 * @param {@link org.apache.lucene.analysis.Analyzer Analyzer} to be used
 *        for analysis
 *
 * @return list of tokens
 */
/**
 * Tokenizes the given string with the given analyzer.
 *
 * @param stringToAnalyze the string to be tokenized
 * @param analyzer the {@link org.apache.lucene.analysis.Analyzer Analyzer} to be
 *        used for analysis
 * @return the list of tokens, in order
 * @throws RuntimeException wrapping any {@link IOException} from the analyzer
 */
public static List<String> getTokensForString(String stringToAnalyze, Analyzer analyzer) {
    List<String> tokens = new ArrayList<String>();
    // try-with-resources fixes the original's leak: close() was inside the try
    // body, so any exception during tokenization left the stream unclosed
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(stringToAnalyze))) {
        // attribute instance is fixed per stream — fetch it once
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return tokens;
}

From source file:de.unidue.inf.is.ezdl.dlcore.data.extractor.TermExtractor.java

License:Open Source License

/**
 * Split the information cause in sense of term it is a standalone word.
 * TODO this method removes stopwords but don't detect any phrases.
 * /*from w w w.j  a va  2 s  . co  m*/
 * @param result
 *            the list we will append the items
 * @param item
 *            the item itself.
 */
/**
 * Splits the given item into standalone terms, filters them (stopword removal),
 * and appends each surviving term to the result in lower case.
 * TODO this method removes stopwords but doesn't detect any phrases.
 *
 * @param result the list we will append the items to
 * @param item the item itself; ignored when {@code null}
 */
private void add(ExtractionResultImpl result, String item) {
    // guard clause replaces the original's whole-body if-nesting
    if (item == null) {
        return;
    }
    inferLanguage(item);
    List<String> terms = new ArrayList<String>();

    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_30, new StringReader(item));
    // StandardTokenizer registers its term attribute at construction time
    TermAttribute termAttribute = tokenStream.getAttribute(TermAttribute.class);

    try {
        while (tokenStream.incrementToken()) {
            terms.add(termAttribute.term());
        }
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
    } finally {
        // the original never closed the tokenizer, leaking its reader;
        // keep this method's log-don't-throw error policy
        try {
            tokenStream.close();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }

    terms = filter.filter(terms, locale);

    for (String t : terms) {
        if (!StringUtils.isEmpty(t)) {
            Entry e = new EntryImpl(t.toLowerCase(locale));
            result.add(e);
        }
    }
}