List of usage examples for org.apache.lucene.analysis.TokenStream#addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
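All of the examples below follow the same consumption pattern, so here is a minimal sketch of it up front. The method name, field name, and text are placeholders, not taken from any example in this list:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Minimal sketch of the standard consumption pattern. addAttribute() registers
// the attribute with the stream (or returns the instance already registered);
// the same object is then updated in place by every incrementToken() call, so
// it is fetched once, before the loop.
static void printTokens(Analyzer analyzer, String text) throws IOException {
    TokenStream ts = analyzer.tokenStream("field", new StringReader(text));
    try {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                      // required before the first incrementToken()
        while (ts.incrementToken()) {
            System.out.println(term.toString());
        }
        ts.end();                        // records end-of-stream state (final offset)
    } finally {
        ts.close();
    }
}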
From source file: nl.inl.blacklab.filter.TestTranscribeGermanAccentsSynonymFilter.java
License: Apache License
@Test
public void testRetrieve() throws IOException {
    TokenStream ts = new StubTokenStream(new String[] { "Köln", "Berlin" });
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        // the synonym filter first emits the original term, then its transcription
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Köln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Koeln", new String(ta.buffer(), 0, ta.length()));
        Assert.assertTrue(ts.incrementToken());
        Assert.assertEquals("Berlin", new String(ta.buffer(), 0, ta.length()));
        Assert.assertFalse(ts.incrementToken());
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.filter.TranscribeGermanAccentsFilter.java
License: Apache License
/**
 * Test program.
 *
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Düsseldorf Köln Berlin Österreich"));
    try {
        ts = new TranscribeGermanAccentsFilter(ts);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.filter.TranscribeGermanAccentsSynonymFilter.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream ts = new WhitespaceTokenizer(Version.LUCENE_42,
            new StringReader("Aachen Düsseldorf Köln Berlin Österreich"));
    try {
        ts = new TranscribeGermanAccentsSynonymFilter(ts);
        ts = new RemoveAllAccentsFilter(ts);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // reset once, after the complete filter chain has been assembled
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(term.buffer(), 0, term.length()));
        }
        ts.end();
    } finally {
        ts.close();
    }
}
From source file: nl.inl.blacklab.index.BLDefaultAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    String TEST_STR = "H jij ! ? ?. ]' ??. ";
    Analyzer a = new BLDefaultAnalyzer();
    try {
        TokenStream ts = a.tokenStream("test", new StringReader(TEST_STR));
        CharTermAttribute ta = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        // close this stream before requesting another one from the same analyzer
        ts.close();
        TokenStream ts2 = a.tokenStream(ComplexFieldUtil.propertyField("test", null, "s"),
                new StringReader(TEST_STR));
        ta = ts2.addAttribute(CharTermAttribute.class);
        ts2.reset();
        while (ts2.incrementToken()) {
            System.out.println(new String(ta.buffer(), 0, ta.length()));
        }
        ts2.close();
    } finally {
        a.close();
    }
}
From source file: nl.inl.blacklab.index.complex.TokenStreamFromList.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream s = new TokenStreamFromList(Arrays.asList("a", "b", "c"), Arrays.asList(1, 1, 1));
    try {
        CharTermAttribute term = s.addAttribute(CharTermAttribute.class);
        s.reset();
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        s.incrementToken();
        System.out.println(new String(term.buffer(), 0, term.length()));
        // all three terms are consumed, so this prints "false"
        System.out.println(s.incrementToken());
    } finally {
        s.close();
    }
}
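The examples so far all consume tokens, but addAttribute() matters just as much on the producer side. Below is a minimal sketch of how a list-backed stream like TokenStreamFromList might be implemented; this is an illustration only, not BlackLab's actual code, and the class name is made up:

import java.io.IOException;
import java.util.List;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

// A TokenStream subclass calls addAttribute() once, at construction time, to
// register the attributes it will fill in on each incrementToken() call.
class ListTokenStream extends TokenStream {
    private final List<String> terms;
    private final List<Integer> increments;
    private int index = -1;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

    ListTokenStream(List<String> terms, List<Integer> increments) {
        this.terms = terms;
        this.increments = increments;
    }

    @Override
    public boolean incrementToken() {
        if (index + 1 >= terms.size()) {
            return false;                // stream exhausted
        }
        index++;
        clearAttributes();               // reset all attributes to their defaults
        termAtt.append(terms.get(index));
        posIncAtt.setPositionIncrement(increments.get(index));
        return true;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        index = -1;                      // allow the stream to be consumed again
    }
}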
From source file: nl.uva.sne.commons.SemanticUtils.java
public static List<String> tokenize(String text, boolean stem) throws IOException, JWNLException {
    text = text.replaceAll("’", "'");   // normalize typographic apostrophes
    text = text.replaceAll("_", " ");
    text = text.replaceAll("[0-9]", "");
    text = text.replaceAll("[\\p{Punct}&&[^'-]]+", " ");
    text = text.replaceAll("(?:'(?:[tdsm]|[vr]e|ll))+\\b", ""); // drop contractions: 't 'd 's 'm 've 're 'll
    text = text.toLowerCase();
    TokenStream tokenStream;
    if (stem) {
        tokenStream = tokenStemStream("field", new StringReader(text));
    } else {
        tokenStream = tokenStream("field", new StringReader(text));
    }
    ArrayList<String> words = new ArrayList<>();
    try {
        CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            words.add(term.toString());
        }
        tokenStream.end();
    } finally {
        tokenStream.close();
    }
    // Logger.getLogger(SemanticUtils.class.getName()).log(Level.INFO, "Returning {0}:", words.size() + " tokens");
    return words;
}
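A hypothetical call for illustration; the actual tokens depend on the analyzers behind the tokenStream/tokenStemStream helpers, which this listing does not show:

// Hypothetical usage of the method above; the input text is made up, and the
// result shown assumes an English stemming analyzer.
List<String> words = SemanticUtils.tokenize("The user's documents weren't indexed.", true);
System.out.println(words);  // e.g. [the, user, document, weren, index]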
From source file: org.alfresco.repo.search.impl.lucene.analysis.MLAnalayserTest.java
License: Open Source License
/**
 * Check that the TokenStream yields exactly the specified tokens.
 * Order is not checked, since the map of locales does not provide a
 * predictable ordering when enumerated.
 *
 * The expected list may contain the same token more than once, and the
 * number of instances must match the number found in the stream.
 *
 * @param ts TokenStream to inspect.
 * @param expectedTokens tokens expected from the stream, in any order.
 * @throws IOException
 */
private void verifyTokenStream(TokenStream ts, List<String> expectedTokens) throws IOException {
    final int expectedCount = expectedTokens.size();
    int count = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            System.out.println("Token: " + termAtt.toString());
            if (expectedTokens.contains(termAtt.toString())) {
                // remove one instance of the term text so that it is not matched again
                expectedTokens.remove(termAtt.toString());
            } else {
                fail("Unexpected token: " + termAtt.toString());
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    assertEquals("Incorrect number of tokens generated.", expectedCount, count);
}
From source file: org.alfresco.repo.search.impl.lucene.analysis.PathTokenFilterTest.java
License: Open Source License
private void tokenise(TokenStream ts, String[] tokens) throws IOException {
    int i = 0;
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = ts.addAttribute(TypeAttribute.class);
    try {
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println("token: " + ts.reflectAsString(true));
            String termText = termAtt.toString();
            if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAMESPACE_PREFIX)) {
                assert (i % 2 == 0);
                assertEquals(termText, tokens[i++]);
            } else if (typeAtt.type().equals(PathTokenFilter.TOKEN_TYPE_PATH_ELEMENT_NAME)) {
                assert (i % 2 == 1);
                assertEquals(termText, tokens[i++]);
            }
        }
        ts.end();
    } finally {
        ts.close();
    }
    if (i != tokens.length) {
        fail("Invalid number of tokens, found " + i + " and expected " + tokens.length);
    }
}
From source file: org.apache.jackrabbit.core.query.lucene.AbstractIndex.java
License: Apache License
/**
 * Returns a document that is finished with text extraction and is ready to
 * be added to the index.
 *
 * @param doc the document to check.
 * @return <code>doc</code> if it is finished already, or a stripped-down
 *         copy of <code>doc</code> without text extractors.
 * @throws IOException if the document cannot be added to the indexing
 *                     queue.
 */
private Document getFinishedDocument(Document doc) throws IOException {
    if (!Util.isDocumentReady(doc)) {
        Document copy = new Document();
        // mark the document that reindexing is required
        copy.add(new Field(FieldNames.REINDEXING_REQUIRED, false, "", Field.Store.NO,
                Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO));
        for (Fieldable f : doc.getFields()) {
            Fieldable field = null;
            Field.TermVector tv = getTermVectorParameter(f);
            Field.Store stored = f.isStored() ? Field.Store.YES : Field.Store.NO;
            Field.Index indexed = getIndexParameter(f);
            if (f instanceof LazyTextExtractorField || f.readerValue() != null) {
                // replace all readers with an empty string reader
                field = new Field(f.name(), new StringReader(""), tv);
            } else if (f.stringValue() != null) {
                field = new Field(f.name(), false, f.stringValue(), stored, indexed, tv);
            } else if (f.isBinary()) {
                field = new Field(f.name(), f.getBinaryValue(), stored);
            } else if (f.tokenStreamValue() != null && f.tokenStreamValue() instanceof SingletonTokenStream) {
                // extract the single token and its payload, then rebuild the stream
                TokenStream tokenStream = f.tokenStreamValue();
                TermAttribute termAttribute = tokenStream.addAttribute(TermAttribute.class);
                PayloadAttribute payloadAttribute = tokenStream.addAttribute(PayloadAttribute.class);
                tokenStream.incrementToken();
                String value = new String(termAttribute.termBuffer(), 0, termAttribute.termLength());
                tokenStream.reset();
                field = new Field(f.name(),
                        new SingletonTokenStream(value, (Payload) payloadAttribute.getPayload().clone()));
            }
            if (field != null) {
                field.setOmitNorms(f.getOmitNorms());
                copy.add(field);
            }
        }
        // schedule the original document for later indexing
        Document existing = indexingQueue.addDocument(doc);
        if (existing != null) {
            // the queue already contained a pending document for this
            // node -> dispose of that document
            Util.disposeDocument(existing);
        }
        // use the stripped-down copy for now
        doc = copy;
    }
    return doc;
}
From source file: org.apache.jackrabbit.core.query.lucene.JackrabbitQueryParser.java
License: Apache License
/**
 * {@inheritDoc}
 */
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
    // only create a prefix query when the term is a single word / token
    Analyzer a = getAnalyzer();
    TokenStream ts = a.tokenStream(field, new StringReader(termStr));
    int count = 0;
    boolean isCJ = false;
    try {
        TypeAttribute t = ts.addAttribute(TypeAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            count++;
            isCJ = StandardTokenizer.TOKEN_TYPES[StandardTokenizer.CJ].equals(t.type());
        }
        ts.end();
    } catch (IOException e) {
        throw new ParseException(e.getMessage());
    } finally {
        try {
            ts.close();
        } catch (IOException e) {
            // ignore
        }
    }
    if (count > 1 && isCJ) {
        // a multi-token CJ term cannot be prefix-expanded; fall back to a field query
        return getFieldQuery(field, termStr);
    } else {
        return getWildcardQuery(field, termStr + "*");
    }
}
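For illustration, the outcomes one would expect under a StandardTokenizer-style analyzer; these examples are hypothetical, not taken from the Jackrabbit sources:

// Hypothetical behavior of getPrefixQuery(), assuming StandardTokenizer-style analysis:
// getPrefixQuery("title", "lucene") -> wildcard query title:lucene*   (single token)
// getPrefixQuery("title", "日本語")   -> getFieldQuery(...), because the term analyzes
//                                       into multiple CJ tokens, so appending "*" to
//                                       the raw string would match nothing useful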