Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.
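
Before the project-specific examples below, here is a minimal, self-contained sketch of the typical getAttribute pattern. It assumes Lucene 5.x or later, where StandardAnalyzer has a no-argument constructor, tokenStream(String, String) is available, CharTermAttribute has replaced the older TermAttribute, and reset()/end()/close() must be called on the stream:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class GetAttributeExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        TokenStream stream = analyzer.tokenStream("contents", "Lucene token streams expose attributes");
        // getAttribute returns the CharTermAttribute instance already registered on this stream;
        // it throws IllegalArgumentException if the attribute is not present
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        stream.end();
        stream.close();
        analyzer.close();
    }
}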

Usage

From source file:org.karsha.base.DocIndexer.java

License:Open Source License

/**
 * This method handles the Lemmatization of given text using
 * EnglishLemmaAnalyzer
 *
 * @param text
 * @param tagger- should supply a Stanford parser "MaxentTagger" object
 * @return- Lemmatized text
 * @throws IOException
 * @throws ClassNotFoundException
 */
public String analyze(String text, MaxentTagger tagger) throws IOException, ClassNotFoundException {
    // System.out.println("Analzying "" + text + """);
    //MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");

    Analyzer analyzer = new EnglishLemmaAnalyzer(tagger);
    //System.out.println("\t" + analyzer.getClass().getName() + ":");
    //System.out.print("\t\t");
    TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
    TermAttribute termAttribute = stream.getAttribute(TermAttribute.class);
    String term = "";
    while (stream.incrementToken()) {
        // append the lemmatized form of each token, separated by spaces
        term = term + " " + termAttribute.term();
    }
    stream.clearAttributes();
    //System.out.println("\n");
    return term.trim();
}
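
A possible call site for the method above, sketched as a rough illustration: the DocIndexer constructor is an assumption (it is not shown in the excerpt), and the tagger model path is the one mentioned in the commented-out line inside the method.

MaxentTagger tagger = new MaxentTagger("tagger/bidirectional-distsim-wsj-0-18.tagger");
DocIndexer indexer = new DocIndexer(); // assumed no-argument constructor
String lemmatized = indexer.analyze("The cats were running", tagger);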

From source file:org.karsha.tokenize.DefaultTokenizer.java

License:Open Source License

public String processText(String text) {
    StringBuffer str = new StringBuffer();
    TokenStream stream = tokenStream(new StringReader(text));
    Token token = new Token();

    try {

        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");

        }

    } catch (Exception e) {
        e.printStackTrace();
    }

    return str.toString();
}

From source file:org.karsha.tokenize.SimpleTokenizer.java

License:Open Source License

public String processText(String text) {
    StringBuffer str = new StringBuffer();
    TokenStream stream = tokenStream(new StringReader(text));
    Token token = new Token();

    try {

        while (stream.incrementToken()) {
            str.append(stream.getAttribute(TermAttribute.class).term());
            str.append(" ");

        }
        //            while ((token = stream.next(token)) != null) {
        //                str.append(token.termBuffer(), 0, token.termLength());
        //                str.append(" ");
        //            }
    } catch (Exception e) {
        e.printStackTrace();
    }

    //return str.toString().replace('-', ' ').trim();

    return str.toString();
}

From source file:org.meresco.lucene.suggestion.SuggestionIndex.java

License:Open Source License

public List<String> shingles(String s) throws IOException {
    List<String> shingles = new ArrayList<String>();
    TokenStream stream = this.shingleAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        shingles.add(termAttribute.toString());
    }
    stream.close();
    return shingles;
}

From source file:org.meresco.lucene.suggestion.SuggestionNGramIndex.java

License:Open Source License

public static List<String> ngrams(String s, Boolean trigram) throws IOException {
    List<String> ngram = new ArrayList<String>();
    Analyzer ngramAnalyzer = trigram ? TRIGRAM_ANALYZER : BIGRAM_ANALYZER;
    TokenStream stream = ngramAnalyzer.tokenStream("ignored", s);
    stream.reset();
    CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        ngram.add(termAttribute.toString());
    }
    stream.close();
    return ngram;
}

From source file:org.neo4j.index.lucene.LuceneFulltextIndexService.java

License:Open Source License

@Override
protected Query formQuery(String key, Object value, Object matching) {
    if (matching == MatchingType.EXACT) {
        return new TermQuery(new Term(DOC_INDEX_SOURCE_KEY, value.toString()));
    }

    TokenStream stream = LuceneFulltextDataSource.LOWER_CASE_WHITESPACE_ANALYZER.tokenStream(DOC_INDEX_KEY,
            new StringReader(value.toString().toLowerCase()));
    BooleanQuery booleanQuery = new BooleanQuery();
    try {
        while (stream.incrementToken()) {
            String term = stream.getAttribute(TermAttribute.class).term();
            booleanQuery.add(new TermQuery(new Term(DOC_INDEX_KEY, term)), Occur.MUST);
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return booleanQuery;
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

License:Open Source License

protected Token[] tokenize(final String strOrig, final boolean stemsAllowed, final boolean stopWordsAllowed,
        final boolean filterDigits, final boolean filterWhitespace) {
    if (StringUtil.isEmpty(strOrig)) {
        return EMPTY_TOKENS_LIST;
    }

    List<Token> result = new ArrayList<Token>(64);

    final TokenStream in = getTokenStream(strOrig, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);

    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(new Token(tokenText, off.startOffset(), off.endOffset() - off.startOffset()));
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new Token[result.size()]);
}

From source file:org.omegat.tokenizer.BaseTokenizer.java

License:Open Source License

protected String[] tokenizeToStrings(String str, boolean stemsAllowed, boolean stopWordsAllowed,
        boolean filterDigits, boolean filterWhitespace) {
    if (StringUtil.isEmpty(str)) {
        return EMPTY_STRING_LIST;
    }

    List<String> result = new ArrayList<String>(64);

    final TokenStream in = getTokenStream(str, stemsAllowed, stopWordsAllowed);
    in.addAttribute(CharTermAttribute.class);
    in.addAttribute(OffsetAttribute.class);

    CharTermAttribute cattr = in.getAttribute(CharTermAttribute.class);
    OffsetAttribute off = in.getAttribute(OffsetAttribute.class);

    Locale loc = stemsAllowed ? getLanguage().getLocale() : null;

    try {
        in.reset();
        while (in.incrementToken()) {
            String tokenText = cattr.toString();
            if (acceptToken(tokenText, filterDigits, filterWhitespace)) {
                result.add(tokenText);
                if (stemsAllowed) {
                    String origText = str.substring(off.startOffset(), off.endOffset());
                    if (!origText.toLowerCase(loc).equals(tokenText.toLowerCase(loc))) {
                        result.add(origText);
                    }
                }
            }
        }
        in.end();
        in.close();
    } catch (IOException ex) {
        // shouldn't happen
    }
    return result.toArray(new String[result.size()]);
}

From source file:org.openedit.data.lucene.AnalyzingQueryParserWithStop.java

License:Apache License

/**
 * Returns the analyzed form for the given chunk
 *
 * If the analyzer produces more than one output token from the given chunk,
 * a ParseException is thrown.
 *
 * @param field The target field
 * @param termStr The full term from which the given chunk is excerpted
 * @param chunk The portion of the given termStr to be analyzed
 * @return The result of analyzing the given chunk
 * @throws ParseException when analysis returns other than one output token
 */
protected String analyzeSingleChunk(String field, String termStr, String chunk) throws ParseException {
    String analyzed = null;
    TokenStream stream = null;
    try {
        stream = getAnalyzer().tokenStream(field, chunk);
        stream.reset();
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        // get first and hopefully only output token
        if (stream.incrementToken()) {
            analyzed = termAtt.toString();

            // try to increment again, there should only be one output token
            StringBuilder multipleOutputs = null;
            while (stream.incrementToken()) {
                if (null == multipleOutputs) {
                    multipleOutputs = new StringBuilder();
                    multipleOutputs.append('"');
                    multipleOutputs.append(analyzed);
                    multipleOutputs.append('"');
                }
                multipleOutputs.append(',');
                multipleOutputs.append('"');
                multipleOutputs.append(termAtt.toString());
                multipleOutputs.append('"');
            }
            stream.end();
            if (null != multipleOutputs) {
                throw new ParseException(String.format(getLocale(),
                        "Analyzer created multiple terms for \"%s\": %s", chunk, multipleOutputs.toString()));
            }
        } else {
            // nothing returned by analyzer.  Was it a stop word and the user accidentally
            // used an analyzer with stop words?
            stream.end();
            //Need to just ignore this
            return null;
            //throw new ParseException(String.format(getLocale(), "Analyzer returned nothing for \"%s\"", chunk));
        }
    } catch (IOException e) {
        throw new ParseException(
                String.format(getLocale(), "IO error while trying to analyze single term: \"%s\"", termStr));
    } finally {
        IOUtils.closeWhileHandlingException(stream);
    }
    return analyzed;
}

From source file:org.sc.probro.lucene.BiothesaurusSearcher.java

License:Apache License

public Query createQuery(String phrase) {

    phrase = phrase.trim();

    TermQuery idQuery = new TermQuery(new Term("protein-id", phrase.toUpperCase()));
    TermQuery accQuery = new TermQuery(new Term("accession", phrase.toUpperCase()));

    ArrayList<TermQuery> descQueries = new ArrayList<TermQuery>();
    ArrayList<Term> descTerms = new ArrayList<Term>();

    TokenStream stream = analyzer.tokenStream("description", new StringReader(phrase));
    TermAttribute attr = stream.getAttribute(TermAttribute.class);
    try {
        stream.reset();
        Term lastTerm = null;

        while (stream.incrementToken()) {
            Term t = new Term("description", attr.term());
            descQueries.add(new TermQuery(t));
            descTerms.add(t);

            if (lastTerm != null) {
                Term hyph = new Term("description", lastTerm.text() + "-" + t.text());
                descQueries.add(new TermQuery(hyph));
                //descTerms.add(hyph);
            }
            lastTerm = t;
        }
    } catch (IOException e) {
        e.printStackTrace();
    }

    return createDisjunction(2.0f, createMust(idQuery), createMust(accQuery),
            //createShould(descQueries.toArray(new Query[0])));
            createDescendingQuery(descTerms.toArray(new Term[0])));
}