Example usage for org.apache.lucene.store ByteArrayDataInput readVInt

Introduction

This page collects example usages of org.apache.lucene.store.ByteArrayDataInput.readVInt, drawn from several open-source projects.

Prototype

@Override
public int readVInt()
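
For orientation, here is a minimal, self-contained round trip: a value is written with ByteArrayDataOutput.writeVInt and read back with readVInt. The class name, buffer size, and value are illustrative only, not taken from the examples below.

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class ReadVIntDemo {
    public static void main(String[] args) throws Exception {
        byte[] buf = new byte[10];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeVInt(300); // VInt is variable-length: 300 occupies two bytes

        ByteArrayDataInput in = new ByteArrayDataInput(buf);
        System.out.println(in.readVInt()); // prints 300
    }
}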

Usage

From source file: com.shaie.annots.AnnotatingTokenStreamExample.java

License: Apache License

public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}
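
The payload decoded in this loop evidently holds two VInts: the annotation's start position followed by its length. As a hedged sketch that simply mirrors those reads (the actual encoding inside AnnotatingTokenFilter may differ), such a payload could be produced like this:

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

public class AnnotationPayloads {
    /** Hypothetical helper: encodes (start, length) the way the loop above decodes it. */
    public static BytesRef encode(int start, int length) throws IOException {
        byte[] buf = new byte[10]; // a VInt takes at most 5 bytes, so two always fit
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeVInt(start);
        out.writeVInt(length);
        return new BytesRef(buf, 0, out.getPosition());
    }
}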

From source file: com.shaie.annots.AnnotationSearchExample.java

License: Apache License

public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);

    // We need to add the annotation as a TokenStream field, and therefore cannot use an Analyzer passed in
    // the IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);

    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();

    final ByteArrayDataInput in = new ByteArrayDataInput();
    // payloads must be requested explicitly via the PostingsEnum.PAYLOADS flag
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM), PostingsEnum.PAYLOADS);
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    reader.close();
    dir.close();
}

From source file: com.shaie.annots.AnnotationsUtils.java

License: Apache License

public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}
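
A hypothetical call site for this utility, assuming a single-segment index like the one built in AnnotationSearchExample above:

DirectoryReader reader = DirectoryReader.open(dir);
LeafReader leaf = reader.leaves().get(0).reader(); // single segment assumed
AnnotationsUtils.printAnnotations(leaf, new Term("annot", COLOR_ANNOT_TERM));
reader.close();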

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}

From source file: jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java

License: Apache License

void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    final int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0) {
                limitOffset = synonyms.get(idx - 1).endOffset;
            }
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            // final boolean keepOrig = (code & 0x1) == 0; // not used
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                map.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1) {
                limitOffset = synonyms.get(idx + 1).startOffset;
            }
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }

        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
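
For reference, the packed header read above matches the decode used by Lucene's synonym machinery: the low bit of the first VInt flags whether the original token is kept, and the remaining bits carry the synonym count (compare the keepOrig comment in the loop). A minimal round-trip sketch with illustrative values:

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class SynonymHeaderDemo {
    public static void main(String[] args) throws Exception {
        int count = 3;
        boolean keepOrig = true;
        byte[] buf = new byte[5]; // a single VInt takes at most 5 bytes
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeVInt(count << 1 | (keepOrig ? 0 : 1)); // pack count and keepOrig

        ByteArrayDataInput in = new ByteArrayDataInput(buf);
        int code = in.readVInt();
        System.out.println("keepOrig=" + ((code & 0x1) == 0) + ", count=" + (code >>> 1));
    }
}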

From source file: org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizer.java

License: Apache License

void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0)
                limitOffset = synonyms.get(idx - 1).endOffset;
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            final int count = code >>> 1; // low bit flags keepOrig (unused here); high bits carry the count
            for (int i = 0; i < count; i++) {
                synonymMap.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1)
                limitOffset = synonyms.get(idx + 1).startOffset;
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }

        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}