Example usage for org.apache.lucene.analysis TokenStream getAttribute

Introduction

This page lists example usages of org.apache.lucene.analysis TokenStream getAttribute, collected from real source files.

Prototype

public final <T extends Attribute> T getAttribute(Class<T> attClass) 

Document

Returns the instance of the passed in Attribute contained in this AttributeSource.

The caller must pass in a Class<? extends Attribute> value.

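Before the project-specific examples under Usage, here is a minimal, self-contained sketch of the typical getAttribute workflow. The class name, field name, and sample text are illustrative only, and it assumes a Lucene 5.x-style no-argument StandardAnalyzer:

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class GetAttributeSketch {
    public static void main(final String[] args) throws IOException {
        final Analyzer analyzer = new StandardAnalyzer();
        final TokenStream ts = analyzer.tokenStream("body", "some sample text");
        // getAttribute returns the stream's single shared instance of an attribute
        // and throws IllegalArgumentException if the attribute is not present.
        final CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        final OffsetAttribute offsetAtt = ts.getAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // the same attribute instances are updated in place for each token
            System.out.println(termAtt.toString() + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
        }
        ts.end();
        ts.close();
        analyzer.close();
    }
}

When an attribute may be absent (for example PositionIncrementAttribute on an arbitrary stream), guard the lookup with hasAttribute, as several of the examples below do.
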
Usage

From source file:org.sindice.siren.analysis.filter.URINormalisationFilter.java

License:Apache License

/**
 * For testing purposes.
 */
public static void main(final String[] args) throws IOException {
    final TupleTokenizer stream = new TupleTokenizer(
            new StringReader("" + "<mailto:renaud.delbru@deri.org> <http://renaud.delbru.fr/rdf/foaf> "
                    + "<http://renaud.delbru.fr/>  <http://xmlns.com/foaf/0.1/workplaceHomepage/>"),
            Integer.MAX_VALUE, new WhitespaceAnalyzer(Version.LUCENE_31));
    final TokenStream result = new URINormalisationFilter(stream);
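    // Note: this 3.x-era snippet never calls reset(); since Lucene 4.0 the
    // TokenStream contract requires reset() before the first incrementToken().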
    while (result.incrementToken()) {
        final CharTermAttribute termAtt = result.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncrAtt = result.getAttribute(PositionIncrementAttribute.class);
        System.out.println(termAtt.toString() + ", " + posIncrAtt.getPositionIncrement());
    }
}

From source file:org.sindice.siren.analysis.TestTupleAnalyzer.java

License:Apache License

public void assertAnalyzesTo(final Analyzer a, final String input, final String[] expectedImages,
        final String[] expectedTypes, final int[] expectedPosIncrs, final int[] expectedTupleID,
        final int[] expectedCellID) throws Exception {
    final TokenStream t = a.reusableTokenStream("", new StringReader(input));

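    // getAttribute throws IllegalArgumentException when the requested attribute
    // is not present, so each optional attribute below is guarded by hasAttribute.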
    assertTrue("has TermAttribute", t.hasAttribute(TermAttribute.class));
    final TermAttribute termAtt = t.getAttribute(TermAttribute.class);

    TypeAttribute typeAtt = null;
    if (expectedTypes != null) {
        assertTrue("has TypeAttribute", t.hasAttribute(TypeAttribute.class));
        typeAtt = t.getAttribute(TypeAttribute.class);
    }

    PositionIncrementAttribute posIncrAtt = null;
    if (expectedPosIncrs != null) {
        assertTrue("has PositionIncrementAttribute", t.hasAttribute(PositionIncrementAttribute.class));
        posIncrAtt = t.getAttribute(PositionIncrementAttribute.class);
    }

    TupleAttribute tupleAtt = null;
    if (expectedTupleID != null) {
        assertTrue("has TupleAttribute", t.hasAttribute(TupleAttribute.class));
        tupleAtt = t.getAttribute(TupleAttribute.class);
    }

    CellAttribute cellAtt = null;
    if (expectedCellID != null) {
        assertTrue("has CellAttribute", t.hasAttribute(CellAttribute.class));
        cellAtt = t.getAttribute(CellAttribute.class);
    }

    for (int i = 0; i < expectedImages.length; i++) {

        assertTrue("token " + i + " exists", t.incrementToken());

        assertEquals(expectedImages[i], termAtt.term());

        if (expectedTypes != null) {
            assertEquals(expectedTypes[i], typeAtt.type());
        }

        if (expectedPosIncrs != null) {
            assertEquals(expectedPosIncrs[i], posIncrAtt.getPositionIncrement());
        }

        if (expectedTupleID != null) {
            assertEquals(expectedTupleID[i], tupleAtt.tuple());
        }

        if (expectedCellID != null) {
            assertEquals(expectedCellID[i], cellAtt.cell());
        }
    }

    assertFalse("end of stream", t.incrementToken());
    t.end();
    t.close();
}

From source file:org.sindice.siren.qparser.analysis.filter.QNamesFilter.java

License:Apache License

public QNamesFilter(final TokenStream input, final String path) {
    super(input);
    cTermAtt = input.getAttribute(CharTermAttribute.class);
    try {
        qnames.load(new FileInputStream(path));
    } catch (final FileNotFoundException e) {
        logger.error("QNames mapping file not found", e);
        throw new RuntimeException("QNames mapping file not found", e);
    } catch (final IOException e) {
        logger.error("Parsing of the QNames mapping file failed", e);
        throw new RuntimeException("Parsing of the QNames mapping file failed", e);
    }
    logger.debug("Loading QNames mapping file located at {}", path);
}

From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java

License:Apache License

private void processFieldEntry(String fieldName, String s, CharArraySet set) throws IOException {
    TokenStream ts = analyzer.tokenStream(fieldName, s);
    CharTermAttribute cattr = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        set.add(cattr.toString());
    }
    ts.end();
    ts.close();
}

From source file:org.tallison.lucene.search.concordance.charoffsets.ReanalyzingTokenCharOffsetsReader.java

License:Apache License

private int addFieldValue(String fieldName, int currInd, int charBase, String fieldValue,
        TokenCharOffsetRequests requests, RandomAccessCharOffsetContainer results) throws IOException {
    //Analyzer limitAnalyzer = new LimitTokenCountAnalyzer(baseAnalyzer, 10, true);
    TokenStream stream = baseAnalyzer.tokenStream(fieldName, fieldValue);
    stream.reset();

    int defaultInc = 1;

    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    OffsetAttribute offsetAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    PositionIncrementAttribute incAtt = null;
    if (stream.hasAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class)) {
        incAtt = stream
                .getAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    }

    while (stream.incrementToken()) {

        //Do we need this?
        if (incAtt != null && incAtt.getPositionIncrement() == 0) {
            continue;
        }

        currInd += (incAtt != null) ? incAtt.getPositionIncrement() : defaultInc;
        if (requests.contains(currInd)) {
            results.add(currInd, offsetAtt.startOffset() + charBase, offsetAtt.endOffset() + charBase,
                    termAtt.toString());
        }
        if (currInd > requests.getLast()) {
            // TODO: Is there a way to avoid this? Or, is this
            // an imaginary performance hit?
            while (stream.incrementToken()) {
                //NO-OP
            }
            stream.end();
            stream.close();
            return GOT_ALL_REQUESTS;
        }
    }
    stream.end();
    stream.close();
    return currInd;
}

From source file:org.tallison.lucene.search.concordance.charoffsets.SimpleAnalyzerUtil.java

License:Apache License

/**
 * Allows reuse of terms; this method calls terms.clear() before adding new
 * terms.
 *
 * @param s        string to analyze
 * @param field    to use in analysis
 * @param analyzer analyzer
 * @param terms    list for reuse
 * @return list of strings
 * @throws java.io.IOException if there's an IOException during analysis
 */
public static List<String> getTermStrings(String s, String field, Analyzer analyzer, List<String> terms)
        throws IOException {
    if (terms == null) {
        terms = new ArrayList<>();
    }
    terms.clear();
    TokenStream stream = analyzer.tokenStream(field, s);
    stream.reset();
    CharTermAttribute termAtt = stream
            .getAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);

    while (stream.incrementToken()) {
        terms.add(termAtt.toString());
    }
    stream.end();
    stream.close();

    return terms;
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

@Test
public void testBasicNoUnigrams() throws Exception {
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, false);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream(ConcordanceTestBase.FIELD, s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    List<String> expected = Arrays.asList(new String[] { "a_b", "b_c", "c_d", "d_e", "e_f", "f_g", });

    List<String> returned = new ArrayList<>();
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        assertEquals(1, posIncAttribute.getPositionIncrement());
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestBigramFilter.java

License:Apache License

@Test
public void testIncludeUnigrams() throws Exception {
    List<String> expected = Arrays.asList(
            new String[] { "a", "a_b", "b", "b_c", "c", "c_d", "d", "d_e", "e", "e_f", "f", "f_g", "g", });
    Analyzer analyzer = ConcordanceTestBase.getBigramAnalyzer(MockTokenFilter.EMPTY_STOPSET, 10, 10, true);

    String s = "a b c d e f g";
    TokenStream tokenStream = analyzer.tokenStream("f", s);
    tokenStream.reset();
    CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAttribute = tokenStream.getAttribute(PositionIncrementAttribute.class);

    List<String> returned = new ArrayList<>();
    int i = 0;
    while (tokenStream.incrementToken()) {
        String token = charTermAttribute.toString();
        if (i++ % 2 == 0) {
            assertEquals(1, posIncAttribute.getPositionIncrement());
        } else {
            assertEquals(0, posIncAttribute.getPositionIncrement());
        }
        returned.add(token);
    }
    tokenStream.end();
    tokenStream.close();
    assertEquals(expected, returned);
}

From source file:org.tallison.lucene.search.concordance.TestConcordanceSearcher.java

License:Apache License

@Test
public void testCJKNoUnigrams() throws Exception {

    final CharacterRunAutomaton stops = MockTokenFilter.EMPTY_STOPSET;
    int posIncGap = 10;
    final int charOffsetGap = 10;
    Analyzer analyzer = getCJKBigramAnalyzer(false);
    TokenStream ts = analyzer.tokenStream(FIELD, "");
    ts.reset();
    CharTermAttribute charTermAttribute = ts.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute positionIncrementAttribute = ts.getAttribute(PositionIncrementAttribute.class);

    ts.end();
    ts.close();
    String[] docs = new String[] { "" };

    Directory directory = getDirectory(analyzer, docs);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);
    ConcordanceSearcher searcher = new ConcordanceSearcher(
            new WindowBuilder(2, 2, analyzer.getOffsetGap(FIELD)));
    Query q = new TermQuery(new Term(FIELD, ""));
    //now test straight and span wrapper
    ConcordanceWindowCollector collector = new ConcordanceWindowCollector(10);
    searcher.search(indexSearcher, FIELD, q, q, analyzer, collector);
    for (ConcordanceWindow w : collector.getWindows()) {
        //System.out.println(w);
    }
    reader.close();
    directory.close();

}

From source file:org.usergrid.utils.IndexUtils.java

License:Apache License

public static List<String> keywords(String source) {
    TokenStream ts = analyzer.tokenStream("keywords", new StringReader(source));
    List<String> keywords = new ArrayList<String>();
    try {
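        // TermAttribute is the pre-Lucene 4.0 term API; term() returns the token
        // text. Newer code uses CharTermAttribute instead.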
        while (ts.incrementToken()) {
            keywords.add(ts.getAttribute(TermAttribute.class).term());
        }
    } catch (IOException e) {
        logger.error("Error getting keywords ", e);
    }
    return keywords;
}