Example usage for org.apache.lucene.analysis TokenStream getAttribute

List of usage examples for org.apache.lucene.analysis TokenStream getAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream getAttribute.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Source Link

Document

Returns the instance of the passed in Attribute contained in this AttributeSource

The caller must pass in a Class&lt;? extends Attribute&gt; value identifying the attribute to retrieve.

Usage

From source file:org.sc.probro.lucene.BiothesaurusSearcher.java

License:Apache License

/**
 * Tokenizes the input text with this searcher's analyzer and returns the term
 * text of each token, in stream order.
 *
 * @param input the raw text to tokenize
 * @return the token terms; empty when analysis fails or produces no tokens
 */
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        try {
            // The attribute instance is shared across all tokens of the stream,
            // so it only needs to be fetched once, before iteration.
            TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);

            stream.reset();

            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    tokens.add(termattr.term());
                }
            }

            stream.end();
        } finally {
            // Close even when incrementToken()/reset() throws, so the
            // underlying reader is not leaked.
            stream.close();
        }
    } catch (IllegalArgumentException e) {
        // getAttribute() throws this when the stream carries no TermAttribute.
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    }

    return tokens.toArray(new String[0]);
}

From source file:org.sc.probro.lucene.BiothesaurusSearcher.java

License:Apache License

public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();
    /*/*w ww. j a  va2s  .  c o  m*/
    String[] array = phrase.split("\\s+");
    for(int i = 0; i < array.length; i++) { 
       query.add(new Term(field, array[i]));
    }
    */

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        //stream = new LowerCaseFilter(stream);

        stream.reset();

        while (stream.incrementToken()) {
            if (stream.hasAttribute(TermAttribute.class)) {
                TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                Term t = new Term(field, termattr.term());
                query.add(t);
            }
        }

        stream.end();
        stream.close();

    } catch (IllegalArgumentException e) {
        e.printStackTrace(System.err);
        System.err.println(String.format("Phrase: \"%s\"", phrase));
    }

    return query;
}

From source file:org.sc.probro.lucene.ProteinSearcher.java

License:Apache License

/**
 * Tokenizes the input text with this searcher's analyzer, lower-casing every
 * token, and returns the term text of each token in stream order.
 *
 * @param input the raw text to tokenize
 * @return the lower-cased token terms; empty when analysis fails or produces
 *         no tokens
 */
public String[] tokenize(String input) {
    ArrayList<String> tokens = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(input));
        stream = new LowerCaseFilter(stream);

        try {
            // Shared attribute instance: fetch once before iteration.
            TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);

            stream.reset();

            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    tokens.add(termattr.term());
                }
            }

            stream.end();
        } finally {
            // Close even when iteration throws, so the reader is not leaked.
            stream.close();
        }
    } catch (IllegalArgumentException e) {
        // getAttribute() throws this when the stream carries no TermAttribute.
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    } catch (IOException e) {
        System.err.println(String.format("Phrase: \"%s\"", input));
        e.printStackTrace(System.err);
    }

    return tokens.toArray(new String[0]);
}

From source file:org.sc.probro.lucene.ProteinSearcher.java

License:Apache License

/**
 * Builds a {@link PhraseQuery} over the given field from the analyzed,
 * lower-cased tokens of the supplied phrase.
 *
 * @param field  the index field the phrase terms belong to
 * @param phrase the raw phrase text to analyze
 * @return a phrase query containing one term per analyzed token; empty when
 *         analysis fails or yields no tokens
 * @throws IOException if the token stream fails while reading
 */
public Query createPhraseQuery(String field, String phrase) throws IOException {
    PhraseQuery query = new PhraseQuery();

    try {
        TokenStream stream = analyzer.tokenStream(field, new StringReader(phrase));
        stream = new LowerCaseFilter(stream);

        try {
            // Shared attribute instance: fetch once before iteration.
            TermAttribute termattr = (TermAttribute) stream.getAttribute(TermAttribute.class);

            stream.reset();

            while (stream.incrementToken()) {
                if (stream.hasAttribute(TermAttribute.class)) {
                    query.add(new Term(field, termattr.term()));
                }
            }

            stream.end();
        } finally {
            // Close even when iteration throws, so the reader is not leaked.
            stream.close();
        }
    } catch (IllegalArgumentException e) {
        // getAttribute() throws this when the stream carries no TermAttribute.
        System.err.println(String.format("Phrase: \"%s\"", phrase));
        e.printStackTrace(System.err);
    }

    return query;
}

From source file:org.sd.text.lucene.LuceneUtils.java

License:Open Source License

/**
 * Split the string into tokens using the given analyzer.
 *//*from  w  ww.ja  v  a 2  s. co m*/
/**
 * Split the string into tokens using the given analyzer.
 *
 * @param analyzer  the analyzer to apply; when null the whole string is
 *                  returned as a single token
 * @param fieldName the field name handed to the analyzer
 * @param string    the text to tokenize; null yields null
 * @return the token term texts in stream order, or null when string is null
 * @throws IllegalStateException if tokenization fails with an I/O error
 */
public static final List<String> getTokenTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<String> result = new ArrayList<String>();

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));

        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    result.add(termAttribute.term());
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            try {
                // Close even when incrementToken() throws, so the underlying
                // reader is not leaked.
                tokenStream.close();
            } catch (IOException ignored) {
                // A close failure must not mask the result or the primary exception.
            }
        }
    } else {
        result.add(string);
    }

    return result;
}

From source file:org.sd.text.lucene.LuceneUtils.java

License:Open Source License

/**
 * Splits the string into phrases of adjacent tokens using the given analyzer.
 * Tokens whose start offset equals the previous token's end offset are kept
 * in the same phrase; any gap between offsets starts a new phrase.
 *
 * @param analyzer  the analyzer to apply; when null the whole string becomes
 *                  one single-token phrase
 * @param fieldName the field name handed to the analyzer
 * @param string    the text to tokenize; null yields null
 * @return a list of phrases, each a list of token terms; never empty
 *         (contains at least one, possibly empty, phrase)
 * @throws IllegalStateException if tokenization fails with an I/O error
 */
public static final List<List<String>> getPhraseTexts(Analyzer analyzer, String fieldName, String string) {
    if (string == null)
        return null;

    final List<List<String>> result = new LinkedList<List<String>>();
    List<String> curPhrase = new ArrayList<String>();
    result.add(curPhrase);

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));
        int lastEndOffset = 0;

        try {
            while (tokenStream.incrementToken()) {
                // A token abutting the previous one (no offset gap) stays in
                // the current phrase; otherwise a new phrase is started.
                boolean incPhrase = true;
                if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                    final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                            .getAttribute(OffsetAttribute.class);
                    if (offsetAttribute.startOffset() == lastEndOffset) {
                        incPhrase = false;
                    }
                    lastEndOffset = offsetAttribute.endOffset();
                }

                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    if (incPhrase && curPhrase.size() > 0) {
                        curPhrase = new ArrayList<String>();
                        result.add(curPhrase);
                    }

                    curPhrase.add(termAttribute.term());
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            try {
                // Close even when incrementToken() throws, so the underlying
                // reader is not leaked.
                tokenStream.close();
            } catch (IOException ignored) {
                // A close failure must not mask the result or the primary exception.
            }
        }
    } else {
        curPhrase.add(string);
    }

    return result;
}

From source file:org.sd.text.lucene.LuceneUtils.java

License:Open Source License

/**
 * Build a phrase query from the tokens in the given string using the given
 * analyzer.//  w w w . j  av  a  2 s  .c om
 * <p>
 * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD
 * for fuzzy matches.
 */
/**
 * Build a phrase query from the tokens in the given string using the given
 * analyzer.
 * <p>
 * Use a BooleanClause.Occur.MUST for exact matches and BooleanClause.Occur.SHOULD
 * for fuzzy matches.
 *
 * @param analyzer      the analyzer to apply; when null a single TermQuery on
 *                      the raw string is returned
 * @param fieldName     the field queried
 * @param string        the text to analyze
 * @param termCollector optional collector that receives every term added to
 *                      the query; may be null
 * @param occur         how each phrase clause joins the boolean query
 * @return the assembled query, or a TermQuery fallback when no analyzer is
 *         given (or analysis produced nothing)
 * @throws IllegalStateException if tokenization fails with an I/O error
 */
public static final Query toQuery(Analyzer analyzer, String fieldName, String string,
        Collection<String> termCollector, BooleanClause.Occur occur) {
    Query result = null;

    if (analyzer != null) {
        final TokenStream tokenStream = analyzer.tokenStream(fieldName, new StringReader(string));

        BooleanQuery booleanQuery = null;
        PhraseQuery phraseQuery = null;
        int lastEndOffset = 0;

        try {
            while (tokenStream.incrementToken()) {
                if (tokenStream.hasAttribute(TermAttribute.class)) {
                    final TermAttribute termAttribute = (TermAttribute) tokenStream
                            .getAttribute(TermAttribute.class);
                    final String term = termAttribute.term();

                    // check offset attribute: a gap between tokens closes the
                    // current phrase and starts a new one
                    if (tokenStream.hasAttribute(OffsetAttribute.class)) {
                        final OffsetAttribute offsetAttribute = (OffsetAttribute) tokenStream
                                .getAttribute(OffsetAttribute.class);
                        if (offsetAttribute.startOffset() != lastEndOffset) {
                            // time to increment phrase
                            if (phraseQuery != null) {
                                if (booleanQuery == null)
                                    booleanQuery = new BooleanQuery();
                                booleanQuery.add(phraseQuery, occur);
                                phraseQuery = null;
                            }
                        }
                        lastEndOffset = offsetAttribute.endOffset();
                    }

                    if (phraseQuery == null)
                        phraseQuery = new PhraseQuery();
                    phraseQuery.add(new Term(fieldName, term));
                    if (termCollector != null)
                        termCollector.add(term);
                }
            }
        } catch (IOException e) {
            throw new IllegalStateException(e);
        } finally {
            try {
                // Close even when incrementToken() throws, so the underlying
                // reader is not leaked.
                tokenStream.close();
            } catch (IOException ignored) {
                // A close failure must not mask the result or the primary exception.
            }
        }

        if (phraseQuery != null) {
            if (booleanQuery == null)
                booleanQuery = new BooleanQuery();
            // Use the caller-supplied occur, matching the clauses added inside
            // the loop (the trailing phrase previously used a hard-coded SHOULD).
            booleanQuery.add(phraseQuery, occur);
        }
        result = booleanQuery;
    }

    if (result == null) {
        result = new TermQuery(new Term(fieldName, string));
        if (termCollector != null)
            termCollector.add(string);
    }

    return result;
}

From source file:org.sd.text.lucene.TestSdTokenStream.java

License:Open Source License

/**
 * Consumes the token stream and checks each term against the expected array.
 * When {@code expected} is null the terms are printed instead, which is
 * useful for bootstrapping a new expectation list.
 *
 * @param tokenStream the stream to consume (caller is responsible for closing)
 * @param expected    expected term texts in order, or null to print terms
 * @throws IOException if the stream fails while reading
 */
private final void verifyTokens(TokenStream tokenStream, String[] expected) throws IOException {
    int index = 0;
    while (tokenStream.incrementToken()) {
        final TermAttribute termAttribute = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        final String term = termAttribute.term();
        if (expected != null) {
            assertTrue("got " + term + " at index=" + index, index < expected.length);
            assertEquals("got " + term + " at index=" + index, expected[index], term);
        } else {
            System.out.println(index + ":" + term);
        }

        ++index;
    }
    // Also fails when the stream produced fewer tokens than expected.
    if (expected != null) {
        assertEquals(expected.length, index);
    }
}

From source file:org.sift.runtime.impl.LuceneWordSplitterProcessor.java

License:Apache License

/**
 * Interface method implementation. Emits words as {@link Tuple} values by applying the configured Lucene {@link Analyzer} on specified {@link Tuple} values
 * @see org.sift.runtime.spi.Processor#process(org.sift.runtime.Tuple, org.sift.runtime.spi.OutputCollector)
 */// w  ww  .  j  a v a2 s.c om
/**
 * Interface method implementation. Emits words as {@link Tuple} values by applying the configured Lucene {@link Analyzer} on specified {@link Tuple} values
 * @see org.sift.runtime.spi.Processor#process(org.sift.runtime.Tuple, org.sift.runtime.spi.OutputCollector)
 */
public void process(Tuple tuple, OutputCollector collector) {
    Tuple returnTuple = tuple.clone();
    for (Object line : tuple.getList(Fields.VALUES)) {
        List<String> tokensList = new LinkedList<String>();
        try {
            TokenStream stream = this.analyzer.tokenStream(null,
                    new StringReader(((String) line).toLowerCase()));
            try {
                // Shared attribute instance: fetch once instead of per token.
                TermAttribute termAttr = (TermAttribute) stream.getAttribute(TermAttribute.class);
                while (stream.incrementToken()) {
                    tokensList.add(termAttr.term());
                }
            } finally {
                // Close even when incrementToken() throws, so the underlying
                // reader is not leaked.
                stream.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Error parsing input line : " + line, e);
        }
        String[] tokens = tokensList.toArray(new String[0]);
        for (int i = 0; i < tokens.length; i++) {
            // Build every n-gram (1..nGram) starting at token i; each length
            // is checked against the stop-word list before being emitted.
            StringBuilder tokenBuffer = new StringBuilder();
            for (int j = 0; j < this.getnGram(); j++) {
                if (i + j < tokens.length) {
                    tokenBuffer.append(tokens[i + j]);
                    tokenBuffer.append(StopWords.WORD_BOUNDARY_STRING);
                }
                String word = tokenBuffer.toString().trim();
                if (this.getStopWords() != null && !this.getStopWords().isStopWord(word)) {
                    returnTuple.addToList(Fields.VALUES, tokenBuffer.toString().trim());
                }
            }
        }
    }
    collector.emit(returnTuple);
}

From source file:org.sindice.siren.analysis.filter.URILocalnameFilter.java

License:Apache License

/**
 * For testing purpose// w  w  w  .j  a v  a 2 s.com
 */
/**
 * For testing purpose: tokenizes a fixed set of URIs through the
 * URILocalnameFilter and prints each term with its position increment.
 *
 * @param args unused
 * @throws IOException if the token stream fails while reading
 */
public static void main(final String[] args) throws IOException {
    final TupleTokenizer stream = new TupleTokenizer(
            new StringReader("" + "<mailto:renaud.delbru@deri.org> <http://renaud.delbru.fr/rdf/foaf> "
                    + "<http://renaud.delbru.fr/>  <http://xmlns.com/foaf/0.1/workplaceHomepage> "
                    + "<http://test.com/M%C3%B6ller>"),
            Integer.MAX_VALUE, new WhitespaceAnalyzer(Version.LUCENE_31));
    final TokenStream result = new URILocalnameFilter(stream);
    try {
        // Attribute instances are shared across tokens: fetch once up front.
        final CharTermAttribute termAtt = result.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncrAtt = result.getAttribute(PositionIncrementAttribute.class);
        while (result.incrementToken()) {
            System.out.println(termAtt.toString() + ", " + posIncrAtt.getPositionIncrement());
        }
    } finally {
        // Closing the filter also closes the wrapped tokenizer and its reader.
        result.close();
    }
}