List of usage examples for org.apache.lucene.analysis TokenStream incrementToken
public abstract boolean incrementToken() throws IOException;
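All of the entries below follow the same basic consumption contract: obtain a TokenStream from an Analyzer, register the attributes of interest, call reset(), loop on incrementToken() until it returns false, then call end() and close(). As a quick orientation, here is a minimal, self-contained sketch of that loop; the analyzer, field name, and sample text are illustrative assumptions (a recent Lucene release is assumed), not taken from any of the listed projects.

// Minimal sketch of the standard incrementToken() consumption loop.
// Assumes a Lucene version (5.x or later) where StandardAnalyzer has a
// no-argument constructor; field name and text are made up.
import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class IncrementTokenSketch {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer()) {
            TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello token stream world"));
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                    // required before the first incrementToken()
            while (stream.incrementToken()) {  // returns false once the stream is exhausted
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();                      // records the final offset state
            stream.close();                    // releases the underlying resources
        }
    }
}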
From source file:com.aliasi.lingmed.medline.SearchableMedlineCodec.java
License:Lingpipe license
public static void main(String[] args) throws Exception {
    org.apache.lucene.store.RAMDirectory directory = new org.apache.lucene.store.RAMDirectory();
    // org.apache.lucene.analysis.SimpleAnalyzer analyzer
    //     = new org.apache.lucene.analysis.SimpleAnalyzer();
    // org.apache.lucene.analysis.KeywordAnalyzer analyzer
    //     = new org.apache.lucene.analysis.KeywordAnalyzer();
    MedlineCodec codec = new MedlineCodec();
    Analyzer analyzer = codec.getAnalyzer();

    org.apache.lucene.index.IndexWriterConfig iwConf = new org.apache.lucene.index.IndexWriterConfig(
            org.apache.lucene.util.Version.LUCENE_36, analyzer);
    iwConf.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    org.apache.lucene.index.IndexWriter indexWriter = new org.apache.lucene.index.IndexWriter(directory, iwConf);

    Document doc = new Document();
    doc.add(new Field(Fields.MESH_MINOR_FIELD, "abc", Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field(Fields.MESH_MINOR_FIELD, " xyz efg", Field.Store.NO, Field.Index.ANALYZED));
    indexWriter.addDocument(doc);
    indexWriter.close();

    org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(directory);
    org.apache.lucene.search.IndexSearcher searcher = new org.apache.lucene.search.IndexSearcher(reader);
    org.apache.lucene.queryParser.QueryParser qp = new org.apache.lucene.queryParser.QueryParser(
            org.apache.lucene.util.Version.LUCENE_36, "foo", analyzer);
    org.apache.lucene.search.Query query = qp.parse(Fields.MESH_MINOR_FIELD + ":efg");
    org.apache.lucene.search.TopDocs hits = searcher.search(query, 1000);
    System.out.println("hits.length()=" + hits.scoreDocs.length);

    org.apache.lucene.analysis.TokenStream ts = analyzer.tokenStream(Fields.MESH_MINOR_FIELD,
            new java.io.StringReader("abc xyz efg"));
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute terms = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsets = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute positions = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);
    while (ts.incrementToken()) {
        int increment = positions.getPositionIncrement();
        int start = offsets.startOffset();
        int end = offsets.endOffset();
        String term = terms.toString();
        System.out.println("token=|" + term + "|" + " startOffset=" + start + " endOffset=" + end
                + " positionIncr=" + increment);
    }
}
From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java
License:Open Source License
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
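A hypothetical call site for this helper (the input text and printing lambda are made up for illustration):

LuceneUtil.tokenize("Hello antsdb 2024", (type, term) -> System.out.println(type + " -> " + term));
// With the StandardAnalyzer this prints lines such as "<ALPHANUM> -> hello" and "<NUM> -> 2024",
// since the analyzer lowercases terms and tags numeric tokens with the <NUM> type.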
From source file:com.b2international.index.compat.Highlighting.java
License:Apache License
/**
 * Splits a string to a list of tokens using the specified Lucene analyzer.
 *
 * @param analyzer the analyzer determining token boundaries (may not be {@code null})
 * @param s the string to split
 * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty
 */
public static List<String> split(Analyzer analyzer, final String s) {
    checkNotNull(analyzer, "analyzer");
    if (Strings.isNullOrEmpty(s)) {
        return ImmutableList.of();
    }
    final List<String> tokens = Lists.newArrayList();
    TokenStream stream = null;
    try {
        stream = analyzer.tokenStream(null, new StringReader(s));
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
    } catch (final IOException ignored) {
        // Should not be thrown when using a string reader
    } finally {
        endAndCloseQuietly(stream);
    }
    return tokens;
}
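A possible invocation, using a plain StandardAnalyzer purely for illustration (the project wires in its own analyzers):

List<String> tokens = Highlighting.split(new StandardAnalyzer(), "Clinical finding, NOS");
// With that analyzer, tokens would be ["clinical", "finding", "nos"].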
From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java
License:Open Source License
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        String s = termAttr.toString();
        //out.printf("Output Token %2d: %s%n", i, s);
        Assert.assertTrue(i < expected.length, "got extra term: " + s);
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}
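A hypothetical use inside a test method, assuming a simple lowercasing analyzer (the real tests feed in streams produced by the Rosette analysis chain):

Analyzer analyzer = new StandardAnalyzer();  // stand-in analyzer for illustration only
TokenStream stream = analyzer.tokenStream("text", new StringReader("Simple Rosette test"));
assertSimpleTSOutput(stream, new String[] { "simple", "rosette", "test" });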
From source file:com.basistech.IndexFiles.java
License:Open Source License
private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });
    for (File dataFile : textFiles) {
        Reader dataReader = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            TokenStream tokenStream = analyzer.tokenStream("full_text", dataReader);
            tokenStream.reset();
            OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);
            while (tokenStream.incrementToken()) {
                offsets.startOffset();
            }
        } finally {
            IOUtils.closeQuietly(dataReader);
        }
    }
}
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}
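An illustrative call, assuming a Lucene 3.x StandardAnalyzer (the analyzer under test is supplied by the concrete test case):

String joined = getTokenStream(new StandardAnalyzer(Version.LUCENE_36), "The Quick Brown Fox");
// With that analyzer the result is "quick brown fox": the stopword "the" is dropped
// and the remaining terms are lowercased and re-joined with single spaces.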
From source file:com.bigdata.search.AbstractSearchTest.java
License:Open Source License
private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}
From source file:com.bigdata.search.FullTextIndex.java
License:Open Source License
/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields to be found in
 * the document will not have changed (they may have different contents, but the
 * same fields exist in the old and new versions of the document).
 *
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream
 *
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
        final Reader r, final boolean filterStopwords) {
    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    // assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {
        while (tokenStream.incrementToken()) {
            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);
            buffer.add(docId, fieldId, term.term());
            n++;
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);
}
From source file:com.billiger.solr.handler.component.QLTBComponent.java
License:Apache License
/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
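Note that the analyzed terms are appended without any separator. A hypothetical call (assuming the configured field type uses a lowercasing, whitespace-splitting analyzer) illustrates the effect:

String normalized = getAnalyzedQuery("Foo Bar");
// normalized is "foobar" -- the individual terms are concatenated back-to-back.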
From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java
License:Apache License
private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;

    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}