List of usage examples for org.apache.lucene.analysis.TokenStream.getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class<? extends Attribute> value. The method returns the stream's instance of that attribute; the same instance is updated in place on every call to incrementToken().
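As a minimal, self-contained sketch of the consumer pattern shared by the examples below (assuming Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor and CharTermAttribute has replaced the older TermAttribute that some of the snippets still use):

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream stream = analyzer.tokenStream("contents", "The quick brown fox")) {
            // getAttribute() returns the stream's single CharTermAttribute
            // instance; incrementToken() refills that same instance, so it
            // must be read (or copied) before the next call.
            CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
            stream.reset();                 // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();                   // records the final offset state
        }
    }
}

Note that addAttribute(Class) is generally safer in consumers, since it creates the attribute when the stream does not already provide it, whereas getAttribute(Class) assumes it is present; the OmegaT examples below call addAttribute before getAttribute for exactly this reason.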
From source file: org.karsha.base.DocIndexer.java
License: Open Source License
/**
 * Handles lemmatization of the given text using EnglishLemmaAnalyzer.
 *
 * @param text the text to lemmatize
 * @param tagger a Stanford parser MaxentTagger object
 * @return the lemmatized text
 * @throws IOException
 * @throws ClassNotFoundException
 */
public String analyze(String text, MaxentTagger tagger) throws IOException, ClassNotFoundException {
    Analyzer analyzer = new EnglishLemmaAnalyzer(tagger);
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    StringBuilder lemmatized = new StringBuilder();
    // One incrementToken() per iteration: the original called it twice per
    // loop, which silently dropped every other token, and concatenated onto
    // a null String, which prefixed the result with "null".
    while (stream.incrementToken()) {
        if (lemmatized.length() > 0) {
            lemmatized.append(' ');
        }
        lemmatized.append(termAttribute.term());
    }
    stream.close();
    return lemmatized.toString();
}
From source file: org.karsha.tokenize.DefaultTokenizer.java
License: Open Source License
public String processText(String text) {
    StringBuilder str = new StringBuilder();
    TokenStream stream = tokenStream(new StringReader(text));
    try {
        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return str.toString();
}
From source file: org.karsha.tokenize.SimpleTokenizer.java
License: Open Source License
public String processText(String text) {
    StringBuilder str = new StringBuilder();
    TokenStream stream = tokenStream(new StringReader(text));
    try {
        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return str.toString();
}
From source file: org.meresco.lucene.suggestion.SuggestionIndex.java
License: Open Source License
public List<String> shingles(String s) throws IOException {
    List<String> shingles = new ArrayList<String>();
    TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        shingles.add(termAttribute.toString());
    }
    stream.close();
    return shingles;
}
From source file: org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License: Open Source License
public static List<String> ngrams(String s, Boolean trigram) throws IOException {
    List<String> ngram = new ArrayList<String>();
    Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER;
    TokenStream stream = ngramAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        ngram.add(termAttribute.toString());
    }
    stream.close();
    return ngram;
}
From source file: org.neo4j.index.lucene.LuceneFulltextIndexService.java
License: Open Source License
@Override
protected Query formQuery(String key, Object value, Object matching) {
    if (matching == MatchingType.EXACT) {
        return new TermQuery(new Term(DOC_INDEX_SOURCE_KEY, value.toString()));
    }
    TokenStream stream = LuceneFulltextDataSource.LOWER_CASE_WHITESPACE_ANALYZER.tokenStream(DOC_INDEX_KEY,
            new StringReader(value.toString().toLowerCase()));
    BooleanQuery booleanQuery = new BooleanQuery();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            booleanQuery.add(new TermQuery(new Term(DOC_INDEX_KEY, term)), Occur.MUST);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return booleanQuery;
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License
protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }
    List<Token> result = new ArrayList<Token>(64);
    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}
From source file: org.omegat.tokenizer.BaseTokenizer.java
License: Open Source License
protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }
    List<String> result = new ArrayList<String>(64);
    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);
    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);
    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;
    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}
From source file: org.openedit.data.lucene.AnalyzingQueryParserWithStop.java
License: Apache License
/**
 * Returns the analyzed form for the given chunk.
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();
            // try to increment again; there should be only one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // Nothing returned by the analyzer, most likely because the chunk
            // was a stop word and the analyzer removes stop words; ignore the
            // chunk instead of failing the parse.
            stream.end();
            return null;
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}
From source file: org.sc.probro.lucene.BiothesaurusSearcher.java
License: Apache License
public Query createQuery(String phrase) {
    phrase = phrase.trim();
    TermQuery idQuery = new TermQuery(new Term("protein-id", phrase.toUpperCase()));
    TermQuery accQuery = new TermQuery(new Term("accession", phrase.toUpperCase()));
    ArrayList<TermQuery> descQueries = new ArrayList<TermQuery>();
    ArrayList<Term> descTerms = new ArrayList<Term>();
    TokenStream stream = analyzer.tokenStream("description", new StringReader(phrase));
    TermAttribute attr = stream.getAttribute(TermAttribute.class);
    try {
        stream.reset();
        Term lastTerm = null;
        while (stream.incrementToken()) {
            Term t = new Term("description", attr.term());
            descQueries.add(new TermQuery(t));
            descTerms.add(t);
            if (lastTerm != null) {
                // also query the hyphenated form of each adjacent pair of terms
                Term hyph = new Term("description", lastTerm.text() + "-" + t.text());
                descQueries.add(new TermQuery(hyph));
            }
            lastTerm = t;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return createDisjunction(2.0f, createMust(idQuery), createMust(accQuery),
            createDescendingQuery(descTerms.toArray(new Term[0])));
}