List of usage examples for org.apache.lucene.store.ByteArrayDataInput.readVInt()
@Override public int readVInt()
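readVInt() decodes a variable-length int that was written with DataOutput.writeVInt(). Before the project examples below, here is a minimal round-trip sketch (not taken from any of those projects): it writes two VInts into a scratch buffer with ByteArrayDataOutput and reads them back with ByteArrayDataInput.readVInt(). The class name, buffer size, and values are illustrative only.

import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class ReadVIntRoundTrip {
    public static void main(String[] args) throws Exception {
        // scratch buffer; a VInt occupies 1-5 bytes, so 16 bytes is plenty for two values
        byte[] buffer = new byte[16];
        ByteArrayDataOutput out = new ByteArrayDataOutput(buffer);
        out.writeVInt(3); // e.g. an annotation's start position
        out.writeVInt(2); // e.g. an annotation's length
        ByteArrayDataInput in = new ByteArrayDataInput();
        in.reset(buffer, 0, out.getPosition());
        System.out.println("start=" + in.readVInt() + ", length=" + in.readVInt());
    }
}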
From source file:com.shaie.annots.AnnotatingTokenStreamExample.java
License:Apache License
public static void main(String[] args) throws Exception {
    String text = "quick brown fox ate the blue red chicken";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(text));
    TeeSinkTokenFilter teeSink = new TeeSinkTokenFilter(tokenizer);
    TokenStream colors = new AnnotatingTokenFilter(teeSink.newSinkTokenStream(new ColorsSinkFilter()),
            COLOR_ANNOT_TERM);

    System.out.println("Text tokens:\n");

    // consume all the tokens from the original stream. this also populates the
    // Sink (colors) with its color-matching tokens
    teeSink.reset();
    CharTermAttribute termAtt = teeSink.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute termPosAtt = teeSink.getAttribute(PositionIncrementAttribute.class);
    int termsPos = -1;
    while (teeSink.incrementToken()) {
        termsPos += termPosAtt.getPositionIncrement();
        System.out.println("term=" + termAtt + ", pos=" + termsPos);
    }
    teeSink.end();
    tokenizer.end();

    System.out.println("\nAnnotation tokens:\n");

    // now consume the color annotation tokens from the colors stream
    CharTermAttribute colorAtt = colors.getAttribute(CharTermAttribute.class);
    PayloadAttribute payloadAtt = colors.getAttribute(PayloadAttribute.class);
    ByteArrayDataInput in = new ByteArrayDataInput();
    colors.reset();
    while (colors.incrementToken()) {
        BytesRef bytes = payloadAtt.getPayload();
        // each payload holds two VInts: the annotation's start position and its length
        in.reset(bytes.bytes, bytes.offset, bytes.length);
        System.out.println("term=" + colorAtt + ", start=" + in.readVInt() + ", length=" + in.readVInt());
    }
    colors.end();
    colors.close();

    teeSink.close();
    tokenizer.close();
}
From source file:com.shaie.annots.AnnotationSearchExample.java
License:Apache License
public static void main(String[] args) throws Exception {
    Directory dir = new RAMDirectory();
    IndexWriterConfig conf = new IndexWriterConfig(new WhitespaceAnalyzer());
    IndexWriter writer = new IndexWriter(dir, conf);

    // we need to add the annotation as a TokenStream field, therefore cannot use an Analyzer passed in the
    // IndexWriterConfig.
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("quick brown fox ate the blue red chicken"));
    TeeSinkTokenFilter textStream = new TeeSinkTokenFilter(tokenizer);
    TokenStream colorAnnotationStream = new AnnotatingTokenFilter(
            textStream.newSinkTokenStream(new ColorsSinkFilter()), COLOR_ANNOT_TERM);

    Document doc = new Document();
    doc.add(new TextField("text", textStream));
    doc.add(new TextField("annot", colorAnnotationStream));
    writer.addDocument(doc);
    writer.close();

    DirectoryReader reader = DirectoryReader.open(dir);
    LeafReader ar = reader.leaves().get(0).reader(); // we only have one segment
    printFieldTerms(ar, "text");
    System.out.println();

    final ByteArrayDataInput in = new ByteArrayDataInput();
    PostingsEnum dape = ar.postings(new Term("annot", COLOR_ANNOT_TERM));
    int docID = dape.nextDoc();
    int freq = dape.freq();
    System.out.println("Color annotation spans: doc=" + docID + ", freq=" + freq);
    for (int i = 0; i < freq; i++) {
        dape.nextPosition();
        BytesRef payload = dape.getPayload();
        in.reset(payload.bytes, payload.offset, payload.length);
        System.out.println("  start=" + in.readVInt() + ", length=" + in.readVInt());
    }

    IndexSearcher searcher = new IndexSearcher(reader);

    System.out.println("\nsearching for 'red WITHIN color':");
    Query q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "red")));
    TopDocs td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    System.out.println("\nsearching for 'ate WITHIN color':");
    q = new SpanWithinQuery(new SpanAnnotationTermQuery(new Term("annot", COLOR_ANNOT_TERM)),
            new SpanInclusivePositionTermQuery(new Term("text", "ate")));
    td = searcher.search(q, 10);
    System.out.println("  num results: " + td.scoreDocs.length);

    reader.close();
    dir.close();
}
From source file:com.shaie.annots.AnnotationsUtils.java
License:Apache License
public static void printAnnotations(LeafReader reader, Term term) throws IOException {
    System.out.println("Annotations for " + term);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    final PostingsEnum postings = reader.postings(term, PostingsEnum.PAYLOADS);
    for (int docID = postings.nextDoc(); docID != DocIdSetIterator.NO_MORE_DOCS; docID = postings.nextDoc()) {
        final int freq = postings.freq();
        System.out.println("  doc=" + docID + ", freq=" + freq);
        for (int i = 0; i < freq; i++) {
            postings.nextPosition();
            final BytesRef payload = postings.getPayload();
            in.reset(payload.bytes, payload.offset, payload.length);
            System.out.println("    start=" + in.readVInt() + ", length=" + in.readVInt());
        }
    }
}
From source file:com.shaie.annots.filter.PreAnnotatedTokenFilterTest.java
License:Apache License
private static void assertTokenInfos(TokenStream ts, TokenInfo... infos) throws IOException {
    ts.reset();
    final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    final PayloadAttribute payloadAtt = ts.addAttribute(PayloadAttribute.class);
    final ByteArrayDataInput in = new ByteArrayDataInput();
    int pos = -1;
    for (final TokenInfo info : infos) {
        assertThat(ts.incrementToken()).isTrue();
        pos += posIncrAtt.getPositionIncrement();
        int len = -1;
        final BytesRef payload = payloadAtt.getPayload();
        if (info.len != -1) {
            assertThat(payload).isNotNull();
            in.reset(payload.bytes);
            len = in.readVInt();
        } else {
            assertThat(payload).isNull();
        }
        assertThat(new TokenInfo(term.toString(), pos, len)).isEqualTo(info);
    }
    assertThat(ts.incrementToken()).isFalse();
}
From source file:jp.sf.fess.solr.plugin.analysis.synonym.NGramSynonymTokenizer.java
License:Apache License
void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    final int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        final MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0) {
                limitOffset = synonyms.get(idx - 1).endOffset;
            }
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            // final boolean keepOrig = (code & 0x1) == 0; // not used
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                map.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                final String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1) {
                limitOffset = synonyms.get(idx + 1).startOffset;
            }
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }
        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
From source file:org.codelibs.elasticsearch.synonym.analysis.NGramSynonymTokenizer.java
License:Apache License
void tokenizeWholeBlock() {
    queue.clear();
    int nextStart = 0;
    int end = block.length();
    boolean afterSynonymProduced = false;
    final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
    for (int idx = 0; idx < synonyms.size(); idx++) {
        MyToken synonym = synonyms.get(idx);
        tokenizePartialBlock(nextStart, synonym.startOffset, afterSynonymProduced);

        // enqueue prev-synonym
        if (expand) {
            int limitOffset = 0;
            if (idx > 0)
                limitOffset = synonyms.get(idx - 1).endOffset;
            processPrevSynonym(synonym.startOffset, limitOffset);
        }

        // enqueue synonyms
        if (expand) {
            bytesReader.reset(synonym.output.bytes, synonym.output.offset, synonym.output.length);
            final int code = bytesReader.readVInt();
            final int count = code >>> 1;
            for (int i = 0; i < count; i++) {
                synonymMap.words.get(bytesReader.readVInt(), scratchBytes);
                if (scratchChars.chars.length < scratchBytes.length) {
                    scratchChars.chars = new char[scratchBytes.length];
                }
                scratchChars.length = UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars.chars);
                String word = scratchChars.toString();
                int posInc = 0, seq = i + 1;
                if (synonym.word.equals(word)) {
                    posInc = 1;
                    seq = 0;
                }
                queue.add(new MyToken(word, synonym.startOffset, synonym.endOffset, posInc, seq));
            }
        } else {
            queue.add(synonym);
        }

        // enqueue after-synonym
        if (expand) {
            int limitOffset = block.length();
            if (idx < synonyms.size() - 1)
                limitOffset = synonyms.get(idx + 1).startOffset;
            afterSynonymProduced = processAfterSynonym(synonym.endOffset, limitOffset);
        }
        nextStart = synonym.endOffset;
    }
    tokenizePartialBlock(nextStart, end, afterSynonymProduced);
}
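In the two NGramSynonymTokenizer examples above, the BytesRef being decoded is a SynonymMap FST output: the first VInt packs a keepOrig flag in its low bit (as the commented-out line in the first variant shows) and the number of synonym entries in the remaining bits, and each following VInt is an ordinal into SynonymMap.words. A stripped-down sketch of just that decode step, assuming output is such a BytesRef, synonymMap is the SynonymMap it came from, and scratch is a reusable BytesRef:

// Sketch only: `output`, `synonymMap`, and `scratch` are assumed to exist as described above.
ByteArrayDataInput in = new ByteArrayDataInput();
in.reset(output.bytes, output.offset, output.length);
int code = in.readVInt();
boolean keepOrig = (code & 0x1) == 0; // low bit: whether the original term is kept
int count = code >>> 1;               // remaining bits: number of synonym entries
for (int i = 0; i < count; i++) {
    int wordOrd = in.readVInt();            // ordinal into the synonym map's word dictionary
    synonymMap.words.get(wordOrd, scratch); // resolves the ordinal to the synonym's UTF-8 bytes
}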