Example usage for org.apache.lucene.store.ByteArrayDataInput ByteArrayDataInput()

Introduction

This page collects example usages of the no-argument constructor of org.apache.lucene.store.ByteArrayDataInput, drawn from open-source projects.

Prototype

public ByteArrayDataInput() 
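
The no-argument constructor creates a reader over an empty buffer; the instance is then pointed at real data with reset(byte[]) or reset(byte[], offset, length), which is why the examples below allocate one ByteArrayDataInput and reuse it across documents or payloads. A minimal round-trip sketch, assuming a scratch buffer written with the companion ByteArrayDataOutput (buffer size and values are illustrative):

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class ByteArrayDataInputRoundTrip {
    public static void main(String[] args) throws Exception {
        // write two vInts into a scratch buffer
        byte[] scratch = new byte[16];
        ByteArrayDataOutput out = new ByteArrayDataOutput(scratch);
        out.writeVInt(3); // e.g. an annotation's start position
        out.writeVInt(2); // e.g. an annotation's length

        // re-point a reusable ByteArrayDataInput at the bytes just written
        ByteArrayDataInput in = new ByteArrayDataInput();
        in.reset(scratch, 0, out.getPosition());
        System.out.println("start=" + in.readVInt() + ", length=" + in.readVInt());
        System.out.println("eof=" + in.eof()); // true: both values consumed
    }
}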

Usage

From source file: com.shaie.annots.AnnotatingTokenStreamExample.java

License: Apache License

public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}
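
The payload decoded in the loop above is assumed to hold two vInts: the annotation's start position and its length. The encoder is not shown on this page, so here is a hedged sketch of the producing side under that assumption; AnnotationPayload and its parameter names are hypothetical, not the actual AnnotatingTokenFilter implementation:

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

/** Hypothetical helper: packs a span into the two-vInt layout read back above. */
public final class AnnotationPayload {
    public static BytesRef encode(int start, int length) throws IOException {
        byte[] buf = new byte[10]; // a vInt occupies at most 5 bytes
        ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeVInt(start);  // annotation start position
        out.writeVInt(length); // annotation length in positions
        return new BytesRef(buf, 0, out.getPosition());
    }
}

A token filter would typically attach the result with PayloadAttribute.setPayload(...), which is what the PayloadAttribute in the loop above reads back.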

From source file: com.shaie.annots.AnnotationSearchExample.java

License: Apache License

public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);

    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
    // IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);

    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();

    final ByteArrayDataInput in = new ByteArrayDataInput();
    // request payloads explicitly, as in AnnotationsUtils below, so getPayload() is non-null
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM), PostingsEnum.PAYLOADS);
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    reader.close();
    dir.close();
}

From source file: com.shaie.annots.AnnotationsUtils.java

License: Apache License

public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}

From source file: com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java

License: Apache License

private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}

From source file: com.shaie.annots.SpanAnnotationTermQuery.java

License: Apache License

@Override
public Spans getSpans(LeafReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts)
        throws IOException {
    final Spans spans = super.getSpans(context, acceptDocs, termContexts);
    return new Spans() {
        private int start, end;
        final ByteArrayDataInput in = new ByteArrayDataInput();

        @Override
        public int start() {
            return start;
        }

        @Override
        public boolean skipTo(int target) throws IOException {
            return spans.skipTo(target);
        }

        @Override
        public boolean next() throws IOException {
            if (!spans.next()) {
                return false;
            }
            if (!isPayloadAvailable()) {
                return next();
            }
            byte[] payload = getPayload().iterator().next();
            in.reset(payload);
            start = in.readVInt();
            end = in.readVInt() + start - 1; // end is inclusive
            return true;
        }

        @Override
        public boolean isPayloadAvailable() throws IOException {
            return spans.isPayloadAvailable();
        }

        @Override
        public Collection<byte[]> getPayload() throws IOException {
            return spans.getPayload();
        }

        @Override
        public int end() {
            return end;
        }

        @Override
        public int doc() {
            return spans.doc();
        }

        @Override
        public long cost() {
            return spans.cost();
        }
    };
}

From source file: jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java

License: Apache License

void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    final int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0) {
                limitOffset = synonyms.get(idx - 1).endOffset;
            }
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            // final boolean keepOrig = (code & 0x1) == 0; // not used
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                map.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1) {
                limitOffset = synonyms.get(idx + 1).startOffset;
            }
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }

        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}

From source file: org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizer.java

License: Apache License

void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0)
                limitOffset = synonyms.get(idx - 1).endOffset;
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                synonymMap.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1)
                limitOffset = synonyms.get(idx + 1).startOffset;
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }

        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
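
Both tokenizers above decode the same bit-packed header from the synonym's FST output: a single vInt whose low bit is the keepOrig flag and whose remaining bits are the synonym count, followed by one vInt word ordinal per synonym. This matches the layout Lucene's SynonymMap.Builder writes; a minimal standalone decoder for just that header (class and method names are illustrative):

import java.io.IOException;

import org.apache.lucene.store.ByteArrayDataInput;

/** Illustrative decoder for the SynonymMap output header read above. */
public final class SynonymOutputHeader {
    public static void print(byte[] bytes, int offset, int length) throws IOException {
        final ByteArrayDataInput in = new ByteArrayDataInput();
        in.reset(bytes, offset, length);
        final int code = in.readVInt();
        final boolean keepOrig = (code & 0x1) == 0; // low bit: also emit the original token
        final int count = code >>> 1;               // remaining bits: number of synonyms
        System.out.println("keepOrig=" + keepOrig + ", count=" + count);
        for (int i = 0; i < count; i++) {
            // each entry is a vInt ordinal into SynonymMap.words
            System.out.println("  wordOrd=" + in.readVInt());
        }
    }
}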

From source file: org.elasticsearch.index.fielddata.plain.BinaryDVNumericAtomicFieldData.java

License: Apache License

@Override
public LongValues getLongValues() {
    if (numericType.isFloatingPoint()) {
        return LongValues.asLongValues(getDoubleValues());
    }
    return new LongValues(true) {

        final BytesRef bytes = new BytesRef();
        final ByteArrayDataInput in = new ByteArrayDataInput();
        long[] longs = new long[8];
        int i = Integer.MAX_VALUE;
        int valueCount = 0;

        @Override
        public int setDocument(int docId) {
            values.get(docId, bytes);
            in.reset(bytes.bytes, bytes.offset, bytes.length);
            if (!in.eof()) {
                // first value uses vLong on top of zig-zag encoding, then deltas are encoded using vLong
                long previousValue = longs[0] = ByteUtils.zigZagDecode(ByteUtils.readVLong(in));
                valueCount = 1;
                while (!in.eof()) {
                    longs = ArrayUtil.grow(longs, valueCount + 1);
                    previousValue = longs[valueCount++] = previousValue + ByteUtils.readVLong(in);
                }
            } else {
                valueCount = 0;
            }
            i = 0;
            return valueCount;
        }

        @Override
        public long nextValue() {
            assert i < valueCount;
            return longs[i++];
        }

    };
}
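
The comment inside setDocument() describes the storage layout: the first value is zig-zag plus vLong encoded, and every later value is stored as a vLong delta from its predecessor (so the values must be sorted ascending). A hedged sketch of the matching write side, assuming ByteUtils also exposes the zigZagEncode/writeVLong counterparts of the zigZagDecode/readVLong calls used by the reader:

import org.apache.lucene.store.ByteArrayDataOutput;
import org.elasticsearch.common.util.ByteUtils;

/** Illustrative writer for the delta-vLong layout decoded in setDocument() above. */
public final class DeltaVLongWriter {
    /** Encodes sorted values into dest; returns the number of bytes written. */
    public static int encode(long[] sortedValues, byte[] dest) {
        final ByteArrayDataOutput out = new ByteArrayDataOutput(dest);
        for (int i = 0; i < sortedValues.length; i++) {
            if (i == 0) {
                // first value: zig-zag so negative longs become small positives
                ByteUtils.writeVLong(out, ByteUtils.zigZagEncode(sortedValues[0]));
            } else {
                // later values: non-negative deltas, plain vLong
                ByteUtils.writeVLong(out, sortedValues[i] - sortedValues[i - 1]);
            }
        }
        return out.getPosition();
    }
}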

From source file: org.elasticsearch.index.fielddata.plain.BytesBinaryDVAtomicFieldData.java

License: Apache License

@Override
public SortedBinaryDocValues getBytesValues() {
    return new SortedBinaryDocValues() {

        int count;
        BytesRefBuilder[] refs = new BytesRefBuilder[0];
        final ByteArrayDataInput in = new ByteArrayDataInput();

        @Override
        public void setDocument(int docId) {
            final BytesRef bytes = values.get(docId);
            in.reset(bytes.bytes, bytes.offset, bytes.length);
            if (bytes.length == 0) {
                count = 0;
            } else {
                count = in.readVInt();
                if (count > refs.length) {
                    final int previousLength = refs.length;
                    refs = Arrays.copyOf(refs,
                            ArrayUtil.oversize(count, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
                    for (int i = previousLength; i < refs.length; ++i) {
                        refs[i] = new BytesRefBuilder();
                    }
                }
                for (int i = 0; i < count; ++i) {
                    final int length = in.readVInt();
                    final BytesRefBuilder scratch = refs[i];
                    scratch.grow(length);
                    in.readBytes(scratch.bytes(), 0, length);
                    scratch.setLength(length);
                }
            }
        }

        @Override
        public int count() {
            return count;
        }

        @Override
        public BytesRef valueAt(int index) {
            return refs[index].get();
        }

    };
}
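
setDocument() above expects a simple layout: a vInt value count, then each value as a vInt length followed by its raw bytes. A hedged sketch of a writer for that layout (LengthPrefixedWriter is illustrative, not the Elasticsearch indexing code):

import java.io.IOException;
import java.util.List;

import org.apache.lucene.store.ByteArrayDataOutput;
import org.apache.lucene.util.BytesRef;

/** Illustrative writer for the count + length-prefixed layout decoded above. */
public final class LengthPrefixedWriter {
    public static BytesRef encode(List<BytesRef> docValues) throws IOException {
        int max = 5; // room for the count vInt (at most 5 bytes)
        for (BytesRef v : docValues) {
            max += 5 + v.length; // length vInt + payload
        }
        final byte[] buf = new byte[max];
        final ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
        out.writeVInt(docValues.size());
        for (BytesRef v : docValues) {
            out.writeVInt(v.length);
            out.writeBytes(v.bytes, v.offset, v.length);
        }
        return new BytesRef(buf, 0, out.getPosition());
    }
}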