Example usage for org.apache.lucene.analysis TokenStream addAttribute

List of usage examples for org.apache.lucene.analysis TokenStream addAttribute

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream addAttribute.

Prototype

public final <T extends Attribute> T addAttribute(Class<T> attClass) 

Source Link

Document

The caller must pass in a Class<? extends Attribute> value.
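
A minimal, self-contained sketch of the standard consumer workflow around addAttribute is shown below. It assumes Lucene 4.x (matching the Version.LUCENE_42 used by several of the examples); the field name and input text are illustrative.

public static void printTokens() throws IOException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    TokenStream ts = analyzer.tokenStream("field", new StringReader("some sample text"));
    try {
        // addAttribute returns the stream's shared attribute instance, creating it if absent
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // required before the first call to incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end(); // records end-of-stream state such as the final offset
    } finally {
        ts.close();
    }
}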

Usage

From source file:nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java

License:Apache License

@Test
public void testRetrieve() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "Köln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Kln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
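
Note that the synonym filter first emits the original token ("Köln") and then its transcription ("Koeln") before moving on, which is why the test expects three tokens from two input terms.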

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java

License:Apache License

/**
 * Test program
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}

From source file:nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java

License:Apache License

public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Dsseldorf Kln Berlin sterreich"));
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts.reset();
        ts = new RemoveAllAccentsFilter(ts);
        ts.reset();

        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
    } finally {
        ts.close();
    }
}
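
Note: TokenFilter.reset() forwards to the wrapped stream, so a single reset() on the outermost filter normally resets the whole chain; the reset() after each wrap shown here is redundant but harmless.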

From source file:nl.inl.blacklab.index.BLDefaultAnalyzer.java

License:Apache License

public static void main(String[] args) throws IOException {
    String TEST_STR = "H jij !  ?  ?. ]'      ??. ";

    Analyzer a = new BLDefaultAnalyzer();
    try {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset(); // the TokenStream contract requires reset() before incrementToken()
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        ts2.reset();
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
    } finally {
        a.close();
    }
}

From source file:nl.inl.blacklab.index.complex.TokenStreamFromList.java

License:Apache License

public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        System.out.println(s.incrementToken());
    } finally {
        s.close();
    }
}
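
The final call to incrementToken() returns false once the three supplied tokens are exhausted, which is what the last println prints.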

From source file:nl.uva.sne.commons.SemanticUtils.java

public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("’", "'"); // normalize curly apostrophes to straight quotes

    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");

    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", "");
    text = text.toLowerCase();

    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }

    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    //        Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}
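
A hypothetical call (the input string and the resulting tokens are illustrative; the actual output depends on the analyzers behind tokenStream and tokenStemStream):

List<String> words = SemanticUtils.tokenize("the dogs are running", true);
// with stem = true this might yield ["dog", "run"], assuming stop-word removal and stemming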

From source file:org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java

License:Open Source License

/**
 * Check that the TokenStream yields the exact tokens specified.
 * Note that order is not checked, since the map of locales will not provide a
 * predictable ordering when enumerated.
 *
 * The expected list of tokens may contain the same token more than once and
 * the number of instances will have to match the number found in the stream.
 * 
 * @param ts              TokenStream to inspect.
 * @param expectedTokens  List of expected tokens (order is not significant).
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove an instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
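
Because each matched token is removed from expectedTokens as it is found, a duplicated term must occur in the stream exactly as many times as it occurs in the expected list.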

From source file:org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java

License:Open Source License

private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);

    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));

            String termText = termAtt.toString();

            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }

    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.AbstractIndex.java

License:Apache License

/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already or a stripped down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *                     queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO,
                Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(),
                        new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node. -> dispose the document
            Util.disposeDocument(existing);
        }
        // use the stripped down copy for now
        doc = copy;
    }
    return doc;
}
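
Here addAttribute is used to read the single term and its payload back out of a SingletonTokenStream, so that an equivalent stream can be constructed for the stripped-down copy of the document.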

From source file:org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java

License:Apache License

/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
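
addAttribute(TypeAttribute.class) is used here to detect CJ (Chinese/Japanese) tokens: when the analyzer splits the prefix into more than one CJ token, an ordinary field query is built instead, because a trailing-* wildcard on the raw string would not line up with the per-character terms the analyzer produces.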