Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

On this page you can find example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Document

Consumers (i.e., IndexWriter) use this method to advance the stream to the next token.
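
The documented consumer workflow is: obtain the attributes you need, call reset(), loop on incrementToken() until it returns false, then call end() and close(). The sketch below illustrates that loop; it assumes a recent Lucene release (the no-argument StandardAnalyzer constructor and the String-based tokenStream overload require Lucene 4.10 or later), and the field name "body" and the sample text are arbitrary.

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenDemo {
    public static void main(String[] args) throws IOException {
        try (Analyzer analyzer = new StandardAnalyzer();
                TokenStream ts = analyzer.tokenStream("body", "Hello incrementToken example")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // required before the first call to incrementToken()
            while (ts.incrementToken()) { // returns false once the stream is exhausted
                System.out.println(term.toString());
            }
            ts.end();                     // records the final offset state
        }                                 // close() is handled by try-with-resources
    }
}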

Usage

From source file:com.aliasi.lingmed.medline.SearchableMedlineCodec.java

License:Lingpipe license

public static void main(String[] args) throws Exception {
    org.apache.lucene.store.RAMDirectory directory = new org.apache.lucene.store.RAMDirectory();

    // org.apache.lucene.analysis.SimpleAnalyzer analyzer 
    // = new org.apache.lucene.analysis.SimpleAnalyzer();
    // org.apache.lucene.analysis.KeywordAnalyzer analyzer 
    // = new org.apache.lucene.analysis.KeywordAnalyzer();
    MedlineCodec codec = new MedlineCodec();
    Analyzer analyzer = codec.getAnalyzer();

    org.apache.lucene.index.IndexWriterConfig iwConf = new org.apache.lucene.index.IndexWriterConfig(
            org.apache.lucene.util.Version.LUCENE_36, analyzer);
    iwConf.setOpenMode(org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    org.apache.lucene.index.IndexWriter indexWriter = new org.apache.lucene.index.IndexWriter(directory,
            iwConf);

    Document doc = new Document();
    doc.add(new Field(Fields.MESH_MINOR_FIELD, "abc", Field.Store.NO, Field.Index.ANALYZED));
    doc.add(new Field(Fields.MESH_MINOR_FIELD, " xyz efg", Field.Store.NO, Field.Index.ANALYZED));
    indexWriter.addDocument(doc);
    indexWriter.close();

    org.apache.lucene.index.IndexReader reader = org.apache.lucene.index.IndexReader.open(directory);
    org.apache.lucene.search.IndexSearcher searcher = new org.apache.lucene.search.IndexSearcher(reader);

    org.apache.lucene.queryParser.QueryParser qp = new org.apache.lucene.queryParser.QueryParser(
            org.apache.lucene.util.Version.LUCENE_36, "foo", analyzer);
    org.apache.lucene.search.Query query = qp.parse(Fields.MESH_MINOR_FIELD + ":efg");

    org.apache.lucene.search.TopDocs hits = searcher.search(query, 1000);
    System.out.println("hits.length()=" + hits.scoreDocs.length);

    org.apache.lucene.analysis.TokenStream ts = analyzer.tokenStream(Fields.MESH_MINOR_FIELD,
            new java.io.StringReader("abc xyz efg"));
    org.apache.lucene.analysis.tokenattributes.CharTermAttribute terms = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute.class);
    org.apache.lucene.analysis.tokenattributes.OffsetAttribute offsets = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute.class);
    org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute positions = ts
            .addAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute.class);

    ts.reset(); // consumers must call reset() before the first incrementToken()
    while (ts.incrementToken()) {
        int increment = positions.getPositionIncrement();
        int start = offsets.startOffset();
        int end = offsets.endOffset();
        String term = terms.toString();
        System.out.println("token=|" + term + "|" + " startOffset=" + start + " endOffset=" + end
                + " positionIncr=" + increment);
    }
    ts.end();   // finalize offsets and attribute state
    ts.close();
}

From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java

License:Open Source License

static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
        TokenStream stream = analyzer.tokenStream("", text);
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
        stream.end();   // per the TokenStream contract
        stream.close();
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}
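
The BiConsumer receives the token type first and the term text second. A call from within the same package (tokenize is package-private) might look like the following; with StandardAnalyzer the types are strings such as <ALPHANUM> and <NUM>:

LuceneUtil.tokenize("foo 42 bar", (type, term) -> System.out.println(type + " -> " + term));
// output:
// <ALPHANUM> -> foo
// <NUM> -> 42
// <ALPHANUM> -> bar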

From source file:com.b2international.index.compat.Highlighting.java

License:Apache License

/**
 * Splits a string to a list of tokens using the specified Lucene analyzer.
 *
 * @param analyzer the analyzer determining token boundaries (may not be {@code null})
 * @param s the string to split
 * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty
 */
public static List<String> split(Analyzer analyzer, final String s) {

    checkNotNull(analyzer, "analyzer");

    if (Strings.isNullOrEmpty(s)) {
        return ImmutableList.of();
    }

    final List<String> tokens = Lists.newArrayList();
    TokenStream stream = null;

    try {

        stream = analyzer.tokenStream(null, new StringReader(s));
        stream.reset();

        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }

    } catch (final IOException ignored) {
        // Should not be thrown when using a string reader
    } finally {
        endAndCloseQuietly(stream);
    }

    return tokens;
}
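
A usage sketch (the tokens produced depend entirely on the analyzer passed in; StandardAnalyzer is only an example here):

List<String> tokens = Highlighting.split(new StandardAnalyzer(), "Quick Brown Fox");
// tokens: [quick, brown, fox]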

From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java

License:Open Source License

public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);
    int i = 0;
    while (stream.incrementToken()) {
        String s = termAttr.toString();
        //out.printf("Output Token %2d: %s%n", i, s);
        Assert.assertTrue(i < expected.length, "got extra term: " + s);
        Assert.assertEquals(termAttr.toString(), expected[i], "expected different term at index " + i);
        i++;
    }
    Assert.assertEquals(i, expected.length, "not all tokens produced");
}

From source file:com.basistech.IndexFiles.java

License:Open Source License

private void iterateOverFiles(File directory) throws IOException {
    File[] textFiles = directory.listFiles(new FilenameFilter() {
        public boolean accept(File dir, String name) {
            return name.endsWith(".txt");
        }
    });

    for (File dataFile : textFiles) {
        Reader dataReader = null;
        try {
            dataReader = Files.newReader(dataFile, Charsets.UTF_8);
            TokenStream tokenStream = analyzer.tokenStream("full_text", dataReader);
            tokenStream.reset();
            OffsetAttribute offsets = tokenStream.getAttribute(OffsetAttribute.class);

            while (tokenStream.incrementToken()) {
                offsets.startOffset();
            }
        } finally {
            IOUtils.closeQuietly(dataReader);
        }
    }

}

From source file:com.bigdata.search.AbstractSearchTest.java

License:Open Source License

protected String getTokenStream(Analyzer a, String text) throws IOException {
    StringBuffer sb = new StringBuffer();
    TokenStream s = a.tokenStream(null, new StringReader(text));
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        if (sb.length() != 0) {
            sb.append(" ");
        }
        sb.append(term.term());
    }
    return sb.toString();
}

From source file:com.bigdata.search.AbstractSearchTest.java

License:Open Source License

private void compareTokenStream(Analyzer a, String text, String expected[]) throws IOException {
    TokenStream s = a.tokenStream(null, new StringReader(text));
    int ix = 0;
    while (s.incrementToken()) {
        final TermAttribute term = s.getAttribute(TermAttribute.class);
        final String word = term.term();
        assertTrue(ix < expected.length);
        assertEquals(expected[ix++], word);
    }
    assertEquals(ix, expected.length);
}

From source file:com.bigdata.search.FullTextIndex.java

License:Open Source License

/**
 * Index a field in a document.
 * <p>
 * Note: This method does NOT force a write on the indices. If the <i>buffer</i>
 * overflows, then there will be an index write. Once the caller is done
 * indexing, they MUST invoke {@link TokenBuffer#flush()} to force any data
 * remaining in their <i>buffer</i> to the indices.
 * <p>
 * Note: If a document is pre-existing, then the existing data for that
 * document MUST be removed unless you know that the fields found in the
 * document will not have changed (they may have different contents, but the
 * same fields exist in the old and new versions of the document).
 * 
 * @param buffer
 *            Used to buffer writes onto the text index.
 * @param docId
 *            The document identifier.
 * @param fieldId
 *            The field identifier.
 * @param languageCode
 *            The language code -or- <code>null</code> to use the default
 *            {@link Locale}.
 * @param r
 *            A reader on the text to be indexed.
 * @param filterStopwords
 *            if true, filter stopwords from the token stream            
 * 
 * @see TokenBuffer#flush()
 */
public void index(final TokenBuffer<V> buffer, final V docId, final int fieldId, final String languageCode,
        final Reader r, final boolean filterStopwords) {

    /*
     * Note: You can invoke this on a read-only index. It is only overflow
     * of the TokenBuffer that requires a writable index. Overflow itself
     * will only occur on {document,field} tuple boundaries, so it will
     * never overflow when indexing a search query.
     */
    //        assertWritable();

    int n = 0;

    // tokenize (note: docId,fieldId are not on the tokenStream, but the field could be).
    final TokenStream tokenStream = getTokenStream(languageCode, r, filterStopwords);

    try {

        while (tokenStream.incrementToken()) {

            final TermAttribute term = tokenStream.getAttribute(TermAttribute.class);

            buffer.add(docId, fieldId, term.term());

            n++;

        }

    } catch (IOException ioe) {

        throw new RuntimeException(ioe);

    }

    if (log.isInfoEnabled())
        log.info("Indexed " + n + " tokens: docId=" + docId + ", fieldId=" + fieldId);

}

From source file:com.billiger.solr.handler.component.QLTBComponent.java

License:Apache License

/**
 * Get analyzed version of the query string.
 *
 * This uses the analyzer for the configured FieldType for this
 * component to analyze and re-assemble the original query string.
 * If no queryFieldType is configured, the original query will be
 * returned.
 *
 * This is used both in the prepare() stage of the component and
 * when reading the QLTB map data.
 */
String getAnalyzedQuery(String query) throws IOException {
    if (analyzer == null) {
        return query;
    }
    StringBuilder norm = new StringBuilder();
    TokenStream tokens = analyzer.tokenStream("", new StringReader(query));
    tokens.reset();
    CharTermAttribute termAtt = tokens.addAttribute(CharTermAttribute.class);
    while (tokens.incrementToken()) {
        norm.append(termAtt.buffer(), 0, termAtt.length());
    }
    tokens.end();
    tokens.close();
    return norm.toString();
}
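
Note that the analyzed terms are concatenated with no separator, so the result is a normalized lookup key rather than a readable query. For instance, assuming the configured field type simply lowercases and splits on whitespace (an assumption for illustration):

String key = getAnalyzedQuery("Red Shoes 42");
// key: "redshoes42"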

From source file:com.bizosys.hsearch.inpipe.ComputeTokens.java

License:Apache License

private void tokenize(Doc doc, TermStream ts) throws SystemFault, ApplicationFault, IOException {
    if (null == ts)
        return;
    TokenStream stream = ts.stream;
    if (null == stream)
        return;

    DocTerms terms = doc.terms;
    if (null == doc.terms) {
        terms = new DocTerms();
        doc.terms = terms;
    }

    String token = null;
    int offset = 0;
    CharTermAttribute termA = (CharTermAttribute) stream.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetA = (OffsetAttribute) stream.getAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        token = termA.toString();
        offset = offsetA.startOffset();
        Term term = new Term(doc.tenant, token, ts.sighting, ts.type, offset);
        terms.getTermList().add(term);
    }
    stream.close();
}