List of usage examples for org.apache.lucene.analysis TokenStream getAttribute
public final <T extends Attribute> T getAttribute(Class<T> attClass)
The caller must pass in a Class&lt;? extends Attribute&gt; token identifying the attribute instance to retrieve; an IllegalArgumentException is thrown if the stream does not contain that attribute.
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public String[] tokenize(String input) { ArrayList<String> tokens = new ArrayList<String>(); try {/*from w w w .ja v a2 s. c o m*/ TokenStream stream = analyzer.tokenStream(null, new StringReader(input)); TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class); //stream = new LowerCaseFilter(stream); stream.reset(); while (stream.incrementToken()) { if (stream.hasAttribute(TermAttribute.class)) { String term = termattr.term(); tokens.add(term); } } stream.end(); stream.close(); } catch (IllegalArgumentException e) { System.err.println(String.format("Phrase: \"%s\"", input)); e.printStackTrace(System.err); } catch (IOException e) { System.err.println(String.format("Phrase: \"%s\"", input)); e.printStackTrace(); } return tokens.toArray(new String[0]); }
From source file:org.sc.probro.lucene.BiothesaurusSearcher.java
License:Apache License
public Query createPhraseQuery(String field, String phrase) throws IOException { PhraseQuery query = new PhraseQuery(); /*/*w ww. j a va2s . c o m*/ String[] array = phrase.split("\\s+"); for(int i = 0; i < array.length; i++) { query.add(new Term(field, array[i])); } */ try { TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase)); //stream = new LowerCaseFilter(stream); stream.reset(); while (stream.incrementToken()) { if (stream.hasAttribute(TermAttribute.class)) { TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class); Term t = new Term(field, termattr.term()); query.add(t); } } stream.end(); stream.close(); } catch (IllegalArgumentException e) { e.printStackTrace(System.err); System.err.println(String.format("Phrase: \"%s\"", phrase)); } return query; }
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
/**
 * Splits the input string into lower-cased tokens using this searcher's analyzer.
 *
 * @param input the raw text to analyze
 * @return the lower-cased terms in stream order; empty when analysis fails
 */
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream); // normalize case on top of the analyzer chain
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                    tokens.add(termattr.term());
                }
            }
            stream.end();
        } finally {
            stream.close(); // previously leaked when reset()/incrementToken() threw
        }
    } catch (IllegalArgumentException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace();
    }
    return tokens.toArray(new String[0]);
}
From source file:org.sc.probro.lucene.ProteinSearcher.java
License:Apache License
/**
 * Builds a PhraseQuery over the given field from the lower-cased analyzer tokens of the phrase.
 *
 * @param field  field name each phrase term is matched against
 * @param phrase raw phrase text, analyzed and lower-cased
 * @return the phrase query; empty if analysis produced no terms
 * @throws IOException if reading the token stream fails
 */
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream); // normalize case on top of the analyzer chain
        try {
            stream.reset();
            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                    query.add(new Term(field, termattr.term()));
                }
            }
            stream.end();
        } finally {
            stream.close(); // previously leaked when an exception escaped the loop
        }
    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }
    return query;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Split the string into tokens using the given analyzer.
 *
 * @param analyzer  analyzer producing the tokens; when null the whole string is one token
 * @param fieldName field name handed to the analyzer (may affect per-field analysis)
 * @param string    text to tokenize; null yields null
 * @return token texts in stream order, or null when {@code string} is null
 * @throws IllegalStateException wrapping any IOException from the token stream
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        // NOTE(review): no reset()/end() calls here, consistent with the other
        // LuceneUtils helpers in this file — confirm the target Lucene version
        // does not require them.
        try {
            try {
                while (tokenStream.incrementToken()) {
                    if (tokenStream.hasAttribute(TermAttribute.class)) {
                        final TermAttribute termAttribute = (TermAttribute) tokenStream
                                .getAttribute(TermAttribute.class);
                        result.add(termAttribute.term());
                    }
                }
            } finally {
                tokenStream.close(); // previously skipped when incrementToken() threw
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        result.add(string); // no analyzer: treat the whole string as a single token
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/**
 * Splits the string into phrases of adjacent tokens: tokens whose start offset
 * equals the previous token's end offset stay in the same phrase; a gap starts
 * a new phrase.
 *
 * @param analyzer  analyzer producing the tokens; when null the whole string is one phrase
 * @param fieldName field name handed to the analyzer
 * @param string    text to tokenize; null yields null
 * @return list of phrases (each a list of token texts), or null when {@code string} is null
 * @throws IllegalStateException wrapping any IOException from the token stream
 */
public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;
        try {
            try {
                while (tokenStream.incrementToken()) {
                    boolean incPhrase = true;
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        // Adjacent to the previous token: keep it in the current phrase.
                        if (offsetAttribute.startOffset() == lastEndOffset) {
                            incPhrase = false;
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }
                    if (tokenStream.hasAttribute(TermAttribute.class)) {
                        final TermAttribute termAttribute = (TermAttribute) tokenStream
                                .getAttribute(TermAttribute.class);
                        if (incPhrase && curPhrase.size() > 0) {
                            curPhrase = new ArrayList<String>();
                            result.add(curPhrase);
                        }
                        curPhrase.add(termAttribute.term());
                    }
                }
            } finally {
                tokenStream.close(); // previously skipped when incrementToken() threw
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        }
    } else {
        curPhrase.add(string); // no analyzer: whole string is one single-token phrase
    }

    return result;
}
From source file:org.sd.text.lucene.LuceneUtils.java
License:Open Source License
/** * Build a phrase query from the tokens in the given string using the given * analyzer.// w w w . j av a 2 s .c om * <p> * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD * for fuzzy matches. */ public static final Query toQuery(Analyzer analyzer, String fieldName, String string, Collection<String> termCollector, BooleanClause.Occur occur) { Query result = null; if (analyzer != null) { final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string)); BooleanQuery booleanQuery = null; PhraseQuery phraseQuery = null; int lastEndOffset = 0; try { while (tokenStream.incrementToken()) { if (tokenStream.hasAttribute(TermAttribute.class)) { final TermAttribute termAttribute = (TermAttribute) tokenStream .getAttribute(TermAttribute.class); final String term = termAttribute.term(); // check offset attribute if (tokenStream.hasAttribute(OffsetAttribute.class)) { final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream .getAttribute(OffsetAttribute.class); if (offsetAttribute.startOffset() != lastEndOffset) { // time to increment phrase if (phraseQuery != null) { if (booleanQuery == null) booleanQuery = new BooleanQuery(); booleanQuery.add(phraseQuery, occur); phraseQuery = null; } } lastEndOffset = offsetAttribute.endOffset(); } if (phraseQuery == null) phraseQuery = new PhraseQuery(); phraseQuery.add(new Term(fieldName, term)); if (termCollector != null) termCollector.add(term); } } } catch (IOException e) { throw new IllegalStateException(e); } if (phraseQuery != null) { if (booleanQuery == null) booleanQuery = new BooleanQuery(); booleanQuery.add(phraseQuery, BooleanClause.Occur.SHOULD); } result = booleanQuery; } if (result == null) { result = new TermQuery(new Term(fieldName, string)); if (termCollector != null) termCollector.add(string); } return result; }
From source file:org.sd.text.lucene.TestSdTokenStream.java
License:Open Source License
/**
 * Consumes the token stream and checks each term against {@code expected};
 * when {@code expected} is null, prints the terms instead (dump mode).
 *
 * @param tokenStream stream to consume (caller is responsible for closing it)
 * @param expected    expected terms in order, or null to print instead of assert
 * @throws IOException if reading the token stream fails
 */
private final void verifyTokens(TokenStream tokenStream, String[] expected) throws IOException {
    int index = 0;
    // (removed unused local: `Token token = new Token();` was never referenced)
    while (tokenStream.incrementToken()) {
        final TermAttribute termAttribute = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        final String term = termAttribute.term();
        if (expected != null) {
            assertTrue("got " + term + " at index=" + index, index < expected.length);
            assertEquals("got " + term + " at index=" + index, expected[index], term);
        } else {
            System.out.println(index + ":" + term);
        }
        ++index;
    }
    if (expected != null) {
        // Stream must yield exactly as many tokens as expected.
        assertEquals(expected.length, index);
    }
}
From source file:org.sift.runtime.impl.LuceneWordSplitterProcessor.java
License:Apache License
/** * Interface method implementation. Emits words as {@link Tuple} values by applying the configured Lucene {@link Analyzer} on specified {@link Tuple} values * @see org.sift.runtime.spi.Processor#process(org.sift.runtime.Tuple, org.sift.runtime.spi.OutputCollector) */// w ww . j a v a2 s.c om public void process(Tuple tuple, OutputCollector collector) { Tuple returnTuple = tuple.clone(); for (Object line : tuple.getList(Fields.VALUES)) { List<String> tokensList = new LinkedList<String>(); try { TokenStream stream = this.analyzer.tokenStream(null, new StringReader(((String) line).toLowerCase())); while (stream.incrementToken()) { tokensList.add(((TermAttribute) stream.getAttribute(TermAttribute.class)).term()); } } catch (IOException e) { throw new RuntimeException("Error parsing input line : " + line, e); } String[] tokens = tokensList.toArray(new String[0]); for (int i = 0; i < tokens.length; i++) { StringBuffer tokenBuffer = new StringBuffer(); for (int j = 0; j < this.getnGram(); j++) { if (i + j < tokens.length) { tokenBuffer.append(tokens[i + j]); tokenBuffer.append(StopWords.WORD_BOUNDARY_STRING); } String word = tokenBuffer.toString().trim(); if (this.getStopWords() != null && !this.getStopWords().isStopWord(word)) { returnTuple.addToList(Fields.VALUES, tokenBuffer.toString().trim()); } } } } collector.emit(returnTuple); }
From source file:org.sindice.siren.analysis.filter.URILocalnameFilter.java
License:Apache License
/**
 * For testing purpose: tokenizes a fixed set of URIs, applies
 * URILocalnameFilter, and prints each term with its position increment.
 */
public static void main(final String[] args) throws IOException {
    final String uris = "" + "<mailto:renaud.delbru@deri.org> <http://renaud.delbru.fr/rdf/foaf> "
            + "<http://renaud.delbru.fr/> <http://xmlns.com/foaf/0.1/workplaceHomepage> "
            + "<http://test.com/M%C3%B6ller>";
    final TupleTokenizer tokenizer = new TupleTokenizer(new StringReader(uris), Integer.MAX_VALUE,
            new WhitespaceAnalyzer(Version.LUCENE_31));
    final TokenStream filtered = new URILocalnameFilter(tokenizer);

    final CharTermAttribute term = filtered.getAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncr = filtered.getAttribute(PositionIncrementAttribute.class);

    while (filtered.incrementToken()) {
        System.out.println(term.toString() + ", " + posIncr.getPositionIncrement());
    }
}