Example usage for org.apache.lucene.store ByteArrayDataInput reset

List of usage examples for org.apache.lucene.store ByteArrayDataInput reset

Introduction

In this page you can find the example usage for org.apache.lucene.store ByteArrayDataInput reset.

Prototype

public void reset(byte[] bytes, int offset, int len) 

Source Link

Usage

From source file: com.shaie.annots.AnnotatingTokenStreamExample.java

License: Apache License

/**
 * Demonstrates consuming one tokenized text twice: the tee stream yields the plain text
 * tokens, and the sink-backed {@code colors} stream yields the color-annotation tokens.
 * Each annotation token carries a payload holding two VInts (start position, length),
 * decoded with a reusable {@link ByteArrayDataInput}.
 */
public static void main(String[] args) throws Exception {
    final String text = "quick brown fox ate the blue red chicken";
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    // try-with-resources closes both streams even if token consumption throws;
    // closing the tee filter also closes the wrapped tokenizer (TokenFilter chain).
    try (TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
            TokenStream colors = new AnnotatingTokenFilter(
                    teeSink.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM)) {

        System.out.println("Text tokens:\n");

        // consume all the tokens from the original stream. this also populates the
        // sink (colors) with its color-matching tokens
        teeSink.reset();
        final CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
        int termsPos = -1;
        while (teeSink.incrementToken()) {
            termsPos += termPosAtt.getPositionIncrement();
            System.out.println("term=" + termAtt + ", pos=" + termsPos);
        }
        teeSink.end();
        tokenizer.end();

        System.out.println("\nAnnotation tokens:\n");

        // now consume the color annotation tokens from the colors stream
        final CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
        final ByteArrayDataInput in = new ByteArrayDataInput();
        colors.reset();
        while (colors.incrementToken()) {
            final BytesRef bytes = payloadAtt.getPayload();
            // payload layout: VInt start position followed by VInt span length
            in.reset(bytes.bytes, bytes.offset, bytes.length);
            System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
        }
        colors.end();
    }
}

From source file: com.shaie.annots.AnnotationSearchExample.java

License: Apache License

/**
 * Indexes one document with a "text" field and a parallel "annot" field whose tokens carry
 * (start, length) VInt payloads, then reads the payloads back from the postings and runs two
 * span-within searches against the annotation spans.
 */
public static void main(String[] args) throws Exception {
    final Directory dir = new RAMDirectory();
    final IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());

    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer
    // passed in the IndexWriterConfig.
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    final TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    final TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    // try-with-resources guarantees the writer is closed (and the document committed)
    // even if indexing throws
    try (IndexWriter writer = new IndexWriter(dir, conf)) {
        final Document doc = new Document();
        doc.add(new TextField("text", textStream));
        doc.add(new TextField("annot", colorAnnotationStream));
        writer.addDocument(doc);
    }

    try (DirectoryReader reader = DirectoryReader.open(dir)) {
        final LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
        printFieldTerms(ar, "text");
        System.out.println();

        final ByteArrayDataInput in = new ByteArrayDataInput();
        final PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
        final int docID = dape.nextDoc();
        final int freq = dape.freq();
        System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            dape.nextPosition();
            final BytesRef payload = dape.getPayload();
            // payload layout: VInt start position followed by VInt span length
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
        }

        final IndexSearcher searcher = new IndexSearcher(reader);

        System.out.println("\nsearching for 'red WITHIN color':");
        Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                new SpanInclusivePositionTermQuery(new Term("text", "red")));
        TopDocs td = searcher.search(q, 10);
        System.out.println("  num results: " + td.scoreDocs.length);

        System.out.println("\nsearching for 'ate WITHIN color':");
        q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
                new SpanInclusivePositionTermQuery(new Term("text", "ate")));
        td = searcher.search(q, 10);
        System.out.println("  num results: " + td.scoreDocs.length);
    }
    dir.close();
}

From source file: com.shaie.annots.AnnotationsUtils.java

License: Apache License

/**
 * Prints, for every document containing {@code term}, each position's annotation payload.
 * The payload is decoded as two VInts: the annotation's start position and its length.
 *
 * @param reader the leaf reader to pull postings (with payloads) from
 * @param term   the annotation term whose payloads are printed
 * @throws IOException if reading the postings fails
 */
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    if (postings == null) {
        // LeafReader.postings returns null when the term does not exist in this segment;
        // without this guard the loop below would throw a NullPointerException.
        return;
    }
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            // payload layout: VInt start position followed by VInt span length
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}

From source file: jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java

License: Apache License

/**
 * Tokenizes the entire buffered block into {@code queue}: plain segments between synonym
 * occurrences are handled by {@code tokenizePartialBlock}, and each synonym occurrence is
 * enqueued either as-is (when {@code expand} is off) or expanded into all of its synonym
 * words decoded from the serialized synonym payload.
 */
void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    final int end = block.length();
    boolean afterSynonymProduced = false;
    // reusable reader over each synonym's serialized output bytes
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken synonym = synonyms.get(idx);
        // tokenize the plain-text segment preceding this synonym occurrence
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym, bounded on the left by the previous synonym's end offset
        if (expand) {
            int limitOffset = 0;
            if (idx > 0) {
                limitOffset = synonyms.get(idx - 1).endOffset;
            }
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            // payload format: leading VInt whose low bit is the keepOrig flag and whose
            // remaining bits are the word count, followed by one word-id VInt per word
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            // final boolean keepOrig = (code & 0x1) == 0; // not used
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                // resolve the word id to UTF-8 bytes, then convert to UTF-16 chars,
                // growing the scratch char buffer only when needed
                map.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                // the original word keeps posInc=1 and seq=0; its synonyms stack at the
                // same position (posInc=0) with 1-based sequence numbers
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym, bounded on the right by the next synonym's start offset
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1) {
                limitOffset = synonyms.get(idx + 1).startOffset;
            }
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }

        nextStart = synonym.endOffset;
    }
    // tokenize the trailing plain-text segment after the last synonym
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}

From source file: org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizer.java

License: Apache License

/**
 * Tokenizes the entire buffered block into {@code queue}. Plain segments between synonym
 * occurrences go through {@code tokenizePartialBlock}; each synonym occurrence is enqueued
 * as-is when {@code expand} is off, or expanded into every word decoded from its serialized
 * payload when {@code expand} is on.
 */
void tokenizeWholeBlock() {
    queue.clear();
    final ByteArrayDataInput payloadReader = new ByteArrayDataInput();
    final int blockEnd = block.length();
    boolean producedAfterSynonym = false;
    int segmentStart = 0;
    for (int i = 0; i < synonyms.size(); i++) {
        final MyToken current = synonyms.get(i);
        // tokenize the plain-text segment preceding this synonym occurrence
        tokenizePartialBlock(segmentStart, current.startOffset, producedAfterSynonym);

        if (expand) {
            // enqueue prev-synonym, bounded on the left by the previous synonym's end
            final int prevLimit = (i > 0) ? synonyms.get(i - 1).endOffset : 0;
            processPrevSynonym(current.startOffset, prevLimit);

            // enqueue synonyms: the payload's leading VInt carries the word count in its
            // high bits (low bit is a flag that is not used here)
            payloadReader.reset(current.output.bytes, current.output.offset, current.output.length);
            final int wordCount = payloadReader.readVInt() >>> 1;
            for (int w = 0; w < wordCount; w++) {
                // resolve word id -> UTF-8 bytes -> UTF-16 chars, growing the scratch
                // char buffer only when needed
                synonymMap.words.get(payloadReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                // the original word keeps posInc=1/seq=0; synonyms stack at the same
                // position (posInc=0) with 1-based sequence numbers
                final boolean isOriginalWord = current.word.equals(word);
                queue.add(new MyToken(word, current.startOffset, current.endOffset,
                        isOriginalWord ? 1 : 0, isOriginalWord ? 0 : w + 1));
            }

            // enqueue after-synonym, bounded on the right by the next synonym's start
            final int nextLimit = (i < synonyms.size() - 1) ? synonyms.get(i + 1).startOffset
                    : block.length();
            producedAfterSynonym = processAfterSynonym(current.endOffset, nextLimit);
        } else {
            queue.add(current);
        }

        segmentStart = current.endOffset;
    }
    // tokenize the trailing plain-text segment after the last synonym
    tokenizePartialBlock(segmentStart, blockEnd, producedAfterSynonym);
}