List of usage examples for org.apache.lucene.store.ByteArrayDataInput — no-argument constructor:
public ByteArrayDataInput()
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
/**
 * Demonstrates reading annotation payloads from a token stream.
 *
 * A {@link TeeSinkTokenFilter} splits the analysis chain: the main stream emits the
 * whitespace tokens of {@code text}, while the sink stream ({@code colors}) — wrapped in an
 * {@code AnnotatingTokenFilter} — emits annotation tokens whose payloads encode a span as
 * two VInts (start position, length). Both streams are fully consumed and printed.
 */
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);
    System.out.println("Text tokens:\n");
    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1; // running absolute position, advanced by each token's increment
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();
    System.out.println("\nAnnotation tokens:\n");
    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        // payload layout: VInt start position followed by VInt span length
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();
    teeSink.close();
    tokenizer.close();
}
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
/**
 * Indexes one document carrying a parallel "annot" field whose term payloads encode color
 * annotation spans (VInt start, VInt length), then reads those payloads back via a
 * {@link PostingsEnum} and runs two span-within queries ("red"/"ate" inside a color span).
 */
public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);
    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
    // IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);
    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);
    writer.close();
    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();
    final ByteArrayDataInput in = new ByteArrayDataInput();
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        // payload layout: VInt start position followed by VInt span length
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    IndexSearcher searcher = new IndexSearcher(reader);
    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println(" num results: " + td.scoreDocs.length);
    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println(" num results: " + td.scoreDocs.length);
    reader.close();
    dir.close();
}
From source file:com.shaie.annots.AnnotationsUtils.java
License:Apache License
/**
 * Prints every annotation span recorded for {@code term} in this segment.
 *
 * Each position's payload is decoded as two VInts: the span start followed by the span
 * length.
 *
 * @param reader segment reader to scan
 * @param term   annotation term whose payloads are printed
 * @throws IOException if postings cannot be read
 */
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    if (postings == null) {
        // LeafReader.postings() returns null when the term does not occur in this
        // segment; the original code would have thrown an NPE here.
        return;
    }
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println(" doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            // payload layout: VInt start position followed by VInt span length
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println(" start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}
From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java
License:Apache License
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException { ts.reset();//from w w w . jav a 2 s . co m final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class); final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class); final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class); final ByteArrayDataInput in = new ByteArrayDataInput(); int pos = -1; for (final TokenInfo info : infos) { assertThat(ts.incrementToken()).isTrue(); pos += posIncrAtt.getPositionIncrement(); int len = -1; final BytesRef payload = payloadAtt.getPayload(); if (info.len != -1) { assertThat(payload).isNotNull(); in.reset(payload.bytes); len = in.readVInt(); } else { assertThat(payload).isNull(); } assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info); } assertThat(ts.incrementToken()).isFalse(); }
From source file:com.shaie.annots.SpanAnnotationTermQuery.java
License:Apache License
@Override public Spans getSpans(LeafReaderContext context, Bits acceptDocs, Map<Term, TermContext> termContexts) throws IOException { final Spans spans = super.getSpans(context, acceptDocs, termContexts); return new Spans() { private int start, end; final ByteArrayDataInput in = new ByteArrayDataInput(); @Override//from w w w .jav a 2 s .c o m public int start() { return start; } @Override public boolean skipTo(int target) throws IOException { return spans.skipTo(target); } @Override public boolean next() throws IOException { if (!spans.next()) { return false; } if (!isPayloadAvailable()) { return next(); } byte[] payload = getPayload().iterator().next(); in.reset(payload); start = in.readVInt(); end = in.readVInt() + start - 1; // end is inclusive return true; } @Override public boolean isPayloadAvailable() throws IOException { return spans.isPayloadAvailable(); } @Override public Collection<byte[]> getPayload() throws IOException { return spans.getPayload(); } @Override public int end() { return end; } @Override public int doc() { return spans.doc(); } @Override public long cost() { return spans.cost(); } }; }
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java
License:Apache License
/**
 * Tokenizes the whole buffered block into {@code queue}, interleaving plain n-gram
 * tokens (via {@code tokenizePartialBlock}) with the synonym tokens recorded in
 * {@code synonyms}. When {@code expand} is set, each synonym's FST output is decoded and
 * every word of its synonym group is enqueued; otherwise the matched token is enqueued
 * as-is.
 */
void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    final int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken synonym = synonyms.get(idx);
        // plain text between the previous synonym (or block start) and this one
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);
        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0) {
                limitOffset = synonyms.get(idx - 1).endOffset;
            }
            processPrevSynonym(synonym.startOffset, limitOffset);
        }
        // enqueue synonyms
        if (expand) {
            // decode the synonym group: header VInt packs (wordCount << 1 | keepOrigFlag),
            // followed by one VInt word-id per group member (SynonymMap output format)
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            // final boolean keepOrig = (code & 0x1) == 0; // not used
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                map.words.get(bytesReader.readVInt(), scratchBytes);
                // reuse scratch buffers; grow the char buffer only when the UTF-8
                // byte length exceeds the current capacity
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                // the original word gets posInc=1/seq=0; alternatives stack at the
                // same position (posInc=0) with increasing sequence numbers
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }
        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1) {
                limitOffset = synonyms.get(idx + 1).startOffset;
            }
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }
        nextStart = synonym.endOffset;
    }
    // trailing plain text after the last synonym (or the whole block if none)
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizer.java
License:Apache License
/**
 * Tokenizes the whole buffered block into {@code queue}, interleaving plain n-gram
 * tokens (via {@code tokenizePartialBlock}) with the synonym tokens recorded in
 * {@code synonyms}. When {@code expand} is set, each synonym's FST output is decoded and
 * every word of its synonym group is enqueued; otherwise the matched token is enqueued
 * as-is.
 */
void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        MyToken synonym = synonyms.get(idx);
        // plain text between the previous synonym (or block start) and this one
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);
        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0)
                limitOffset = synonyms.get(idx - 1).endOffset;
            processPrevSynonym(synonym.startOffset, limitOffset);
        }
        // enqueue synonyms
        if (expand) {
            // decode the synonym group: header VInt packs (wordCount << 1 | keepOrigFlag),
            // followed by one VInt word-id per group member (SynonymMap output format)
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                synonymMap.words.get(bytesReader.readVInt(), scratchBytes);
                // reuse scratch buffers; grow the char buffer only when the UTF-8
                // byte length exceeds the current capacity
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                String word = scratchChars.toString();
                // the original word gets posInc=1/seq=0; alternatives stack at the
                // same position (posInc=0) with increasing sequence numbers
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }
        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1)
                limitOffset = synonyms.get(idx + 1).startOffset;
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }
        nextStart = synonym.endOffset;
    }
    // trailing plain text after the last synonym (or the whole block if none)
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
From source file:org.elasticsearch.index.fielddata.plain.BinaryDVNumericAtomicFieldData.java
License:Apache License
/**
 * Returns per-document long values decoded from the binary doc values.
 * Floating-point field types are delegated to {@link #getDoubleValues()}; integral
 * types are decoded from a compact delta encoding (see {@code setDocument}).
 */
@Override
public LongValues getLongValues() {
    if (numericType.isFloatingPoint()) {
        return LongValues.asLongValues(getDoubleValues());
    }
    return new LongValues(true) {
        final BytesRef bytes = new BytesRef();           // reused scratch for the raw doc value
        final ByteArrayDataInput in = new ByteArrayDataInput();
        long[] longs = new long[8];                      // decoded values, grown on demand
        int i = Integer.MAX_VALUE;                       // cursor into longs; "exhausted" until setDocument
        int valueCount = 0;

        @Override
        public int setDocument(int docId) {
            values.get(docId, bytes);
            in.reset(bytes.bytes, bytes.offset, bytes.length);
            if (!in.eof()) {
                // first value uses vLong on top of zig-zag encoding, then deltas are encoded using vLong
                long previousValue = longs[0] = ByteUtils.zigZagDecode(ByteUtils.readVLong(in));
                valueCount = 1;
                while (!in.eof()) {
                    // each subsequent value is the running sum of the decoded deltas
                    longs = ArrayUtil.grow(longs, valueCount + 1);
                    previousValue = longs[valueCount++] = previousValue + ByteUtils.readVLong(in);
                }
            } else {
                // empty payload means the document has no values
                valueCount = 0;
            }
            i = 0;
            return valueCount;
        }

        @Override
        public long nextValue() {
            // caller must not read more than the count returned by setDocument
            assert i < valueCount;
            return longs[i++];
        }
    };
}
From source file:org.elasticsearch.index.fielddata.plain.BytesBinaryDVAtomicFieldData.java
License:Apache License
@Override public SortedBinaryDocValues getBytesValues() { return new SortedBinaryDocValues() { int count; BytesRefBuilder[] refs = new BytesRefBuilder[0]; final ByteArrayDataInput in = new ByteArrayDataInput(); @Override// ww w .j av a 2s. c om public void setDocument(int docId) { final BytesRef bytes = values.get(docId); in.reset(bytes.bytes, bytes.offset, bytes.length); if (bytes.length == 0) { count = 0; } else { count = in.readVInt(); if (count > refs.length) { final int previousLength = refs.length; refs = Arrays.copyOf(refs, ArrayUtil.oversize(count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)); for (int i = previousLength; i < refs.length; ++i) { refs[i] = new BytesRefBuilder(); } } for (int i = 0; i < count; ++i) { final int length = in.readVInt(); final BytesRefBuilder scratch = refs[i]; scratch.grow(length); in.readBytes(scratch.bytes(), 0, length); scratch.setLength(length); } } } @Override public int count() { return count; } @Override public BytesRef valueAt(int index) { return refs[index].get(); } }; }