List of usage examples for the org.apache.lucene.codecs.FieldsConsumer constructor FieldsConsumer()
protected FieldsConsumer()
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
/**
 * Indexes random line-file documents through a codec whose postings format for the
 * {@code "body"} field wraps the default one and, on every {@code write(Fields)} call,
 * re-iterates the supplied postings to gather its own docFreq / totalTermFreq statistics.
 * After indexing, the gathered statistics are compared with what the final
 * {@link IndexReader} reports for the {@code "body"} field.
 *
 * @throws Exception if indexing, reading, or any assertion fails
 */
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  // Running totals of the per-term stats gathered during flushes (never during merges):
  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()

  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {

      // Unwrap per-field formats (both the stock Lucene one and the Rocana one)
      // to get the concrete postings format used for this field:
      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      // Captured so write() can assert that non-merge writes happen on this thread:
      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {

        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:

        return new PostingsFormat(defaultPostingsFormat.getName()) {

          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                // First let the real postings format consume the fields:
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    // Freqs only: pass the previous enum back in as the reuse argument
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    // Read a random number (1..freq) of positions for this doc:
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }

                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));

                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      // First-pass accounting: accumulate this segment's counts now
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  // Terms recorded by earlier write() calls may be absent here; those are skipped
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;

                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      // Second-pass accounting: fill in the placeholder added by the first pass
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    // The map entry accumulates counts across write() calls, so the counts
                    // seen in this single call must not exceed it:
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(
                      TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    // The enum must now be positioned on a term sorting after the target:
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            // Reading is delegated unchanged to the wrapped format
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

  // Index line-file docs until at least atLeast(100) KB (measured by RamUsageTester) were added:
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  // Field-level sums reported by the reader must equal the totals gathered at flush time:
  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  // Per-term stats must match as well; ords are checked only while the enum supports them:
  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        // This TermsEnum has no ord support; stop asking for the rest of the iteration
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  // Every term recorded at flush time must have been enumerated exactly once:
  assertEquals(termFreqs.size(), termCount);
  r.close();
  dir.close();
}
From source file:org.codelibs.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST); return new FieldsConsumer() { private Map<String, Long> fieldOffsets = new HashMap<>(); @Override//www . j av a2 s . c om public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey()); output.writeVLong(entry.getValue()); } output.writeLong(pointer); CodecUtil.writeFooter(output); } finally { IOUtils.close(output); } } @Override public void write(Fields fields) throws IOException { for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } terms.iterator(); new SuggestPayload(); throw new UnsupportedOperationException("QueryBuilders does not support this operation."); // final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( // maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP); // int docCount = 0; // while (true) { // BytesRef term = termsEnum.next(); // if (term == null) { // break; // } // docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS); // builder.startTerm(term); // int docFreq = 0; // while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { // for (int i = 0; i < docsEnum.freq(); i++) { // final int position = docsEnum.nextPosition(); // AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare); // builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight); // // multi fields have the same surface form so we sum up here // maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1); // } // docFreq++; // docCount = Math.max(docCount, docsEnum.docID()+1); // } // 
builder.finishTerm(docFreq); // } // /* // * Here we are done processing the field and we can // * buid the FST and write it to disk. // */ // FST<Pair<Long, BytesRef>> build = builder.build(); // assert build != null || docCount == 0: "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; // /* // * it's possible that the FST is null if we have 2 segments that get merged // * and all docs that have a value in this field are deleted. This will cause // * a consumer to be created but it doesn't consume any values causing the FSTBuilder // * to return null. // */ // if (build != null) { // fieldOffsets.put(field, output.getFilePointer()); // build.save(output); // /* write some more meta-info */ // output.writeVInt(maxAnalyzedPathsForOneInput); // output.writeVInt(maxSurfaceFormsPerAnalyzedForm); // output.writeInt(maxGraphExpansions); // can be negative // int options = 0; // options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0; // options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; // options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; // output.writeVInt(options); // output.writeVInt(XAnalyzingSuggester.SEP_LABEL); // output.writeVInt(XAnalyzingSuggester.END_BYTE); // output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP); // output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER); // } } } }; }
From source file:org.elasticsearch.index.codec.postingformat.Elasticsearch090RWPostingsFormat.java
License:Apache License
@Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { final PostingsFormat delegate = getDefaultWrapped(); final BloomFilteredFieldsConsumer fieldsConsumer = new BloomFilterPostingsFormat(delegate, BloomFilter.Factory.DEFAULT) { @Override/*from w ww .j a v a 2 s. c o m*/ public BloomFilteredFieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { return new BloomFilteredFieldsConsumer(delegate.fieldsConsumer(state), state, delegate); } }.fieldsConsumer(state); return new FieldsConsumer() { @Override public void write(Fields fields) throws IOException { Fields maskedFields = new FilterLeafReader.FilterFields(fields) { @Override public Iterator<String> iterator() { return Iterators.filter(this.in.iterator(), Predicates.not(UID_FIELD_FILTER)); } }; fieldsConsumer.getDelegate().write(maskedFields); maskedFields = new FilterLeafReader.FilterFields(fields) { @Override public Iterator<String> iterator() { return Iterators.singletonIterator(UidFieldMapper.NAME); } }; // only go through bloom for the UID field fieldsConsumer.write(maskedFields); } @Override public void close() throws IOException { fieldsConsumer.close(); } }; }
From source file:org.elasticsearch.index.codec.postingsformat.Elasticsearch090PostingsFormat.java
License:Apache License
@Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { final BloomFilteredFieldsConsumer fieldsConsumer = bloomPostings.fieldsConsumer(state); return new FieldsConsumer() { @Override/*from w ww.j a v a2 s . c o m*/ public void close() throws IOException { fieldsConsumer.close(); } @Override public TermsConsumer addField(FieldInfo field) throws IOException { if (UidFieldMapper.NAME.equals(field.name)) { // only go through bloom for the UID field return fieldsConsumer.addField(field); } return fieldsConsumer.getDelegate().addField(field); } }; }
From source file:org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProvider.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST); return new FieldsConsumer() { private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>(); @Override/*from w w w.ja va2 s .c o m*/ public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<FieldInfo, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey().name); output.writeVLong(entry.getValue()); } output.writeLong(pointer); output.flush(); } finally { IOUtils.close(output); } } @Override public TermsConsumer addField(final FieldInfo field) throws IOException { return new TermsConsumer() { final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP); final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer( AnalyzingCompletionLookupProvider.this, builder); @Override public PostingsConsumer startTerm(BytesRef text) throws IOException { builder.startTerm(text); return postingsConsumer; } @Override public Comparator<BytesRef> getComparator() throws IOException { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { builder.finishTerm(stats.docFreq); // use doc freq as a fallback } @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { /* * Here we are done processing the field and we can * buid the FST and write it to disk. 
*/ FST<Pair<Long, BytesRef>> build = builder.build(); assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; /* * it's possible that the FST is null if we have 2 segments that get merged * and all docs that have a value in this field are deleted. This will cause * a consumer to be created but it doesn't consume any values causing the FSTBuilder * to return null. */ if (build != null) { fieldOffsets.put(field, output.getFilePointer()); build.save(output); /* write some more meta-info */ output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput()); output.writeVInt(maxSurfaceFormsPerAnalyzedForm); output.writeInt(maxGraphExpansions); // can be negative int options = 0; options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0; options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; output.writeVInt(options); output.writeVInt(XAnalyzingSuggester.SEP_LABEL); output.writeVInt(XAnalyzingSuggester.END_BYTE); output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP); output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER); } } }; } }; }
From source file:org.elasticsearch.search.suggest.completion.AnalyzingCompletionLookupProviderV1.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION); return new FieldsConsumer() { private Map<FieldInfo, Long> fieldOffsets = new HashMap<FieldInfo, Long>(); @Override/*from w w w . j av a 2 s . c o m*/ public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<FieldInfo, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey().name); output.writeVLong(entry.getValue()); } output.writeLong(pointer); output.flush(); } finally { IOUtils.close(output); } } @Override public TermsConsumer addField(final FieldInfo field) throws IOException { return new TermsConsumer() { final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( maxSurfaceFormsPerAnalyzedForm, hasPayloads, PAYLOAD_SEP); final CompletionPostingsConsumer postingsConsumer = new CompletionPostingsConsumer( AnalyzingCompletionLookupProviderV1.this, builder); @Override public PostingsConsumer startTerm(BytesRef text) throws IOException { builder.startTerm(text); return postingsConsumer; } @Override public Comparator<BytesRef> getComparator() throws IOException { return BytesRef.getUTF8SortedAsUnicodeComparator(); } @Override public void finishTerm(BytesRef text, TermStats stats) throws IOException { builder.finishTerm(stats.docFreq); // use doc freq as a fallback } @Override public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) throws IOException { /* * Here we are done processing the field and we can * buid the FST and write it to disk. 
*/ FST<Pair<Long, BytesRef>> build = builder.build(); assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; /* * it's possible that the FST is null if we have 2 segments that get merged * and all docs that have a value in this field are deleted. This will cause * a consumer to be created but it doesn't consume any values causing the FSTBuilder * to return null. */ if (build != null) { fieldOffsets.put(field, output.getFilePointer()); build.save(output); /* write some more meta-info */ output.writeVInt(postingsConsumer.getMaxAnalyzedPathsForOneInput()); output.writeVInt(maxSurfaceFormsPerAnalyzedForm); output.writeInt(maxGraphExpansions); // can be negative int options = 0; options |= preserveSep ? SERIALIZE_PRESERVE_SEPERATORS : 0; options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; output.writeVInt(options); } } }; } }; }
From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProvider.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST); return new FieldsConsumer() { private Map<String, Long> fieldOffsets = new HashMap<>(); @Override/*from w ww.j a v a 2 s .c om*/ public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey()); output.writeVLong(entry.getValue()); } output.writeLong(pointer); CodecUtil.writeFooter(output); } finally { IOUtils.close(output); } } @Override public void write(Fields fields) throws IOException { for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } TermsEnum termsEnum = terms.iterator(); PostingsEnum docsEnum = null; final SuggestPayload spare = new SuggestPayload(); int maxAnalyzedPathsForOneInput = 0; final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP); int docCount = 0; while (true) { BytesRef term = termsEnum.next(); if (term == null) { break; } docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS); builder.startTerm(term); int docFreq = 0; while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < docsEnum.freq(); i++) { final int position = docsEnum.nextPosition(); AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare); builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight); // multi fields have the same surface form so we sum up here maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1); } docFreq++; docCount = Math.max(docCount, docsEnum.docID() + 1); } builder.finishTerm(docFreq); } /* * Here we are 
done processing the field and we can * buid the FST and write it to disk. */ FST<Pair<Long, BytesRef>> build = builder.build(); assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; /* * it's possible that the FST is null if we have 2 segments that get merged * and all docs that have a value in this field are deleted. This will cause * a consumer to be created but it doesn't consume any values causing the FSTBuilder * to return null. */ if (build != null) { fieldOffsets.put(field, output.getFilePointer()); build.save(output); /* write some more meta-info */ output.writeVInt(maxAnalyzedPathsForOneInput); output.writeVInt(maxSurfaceFormsPerAnalyzedForm); output.writeInt(maxGraphExpansions); // can be negative int options = 0; options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0; options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; output.writeVInt(options); output.writeVInt(XAnalyzingSuggester.SEP_LABEL); output.writeVInt(XAnalyzingSuggester.END_BYTE); output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP); output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER); } } } }; }
From source file:org.elasticsearch.search.suggest.completion.old.AnalyzingCompletionLookupProviderV1.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { // TODO write index header? CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION); return new FieldsConsumer() { private Map<String, Long> fieldOffsets = new HashMap<>(); @Override// w ww . j ava 2 s. c o m public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey()); output.writeVLong(entry.getValue()); } output.writeLong(pointer); } finally { IOUtils.close(output); } } @Override public void write(Fields fields) throws IOException { for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } TermsEnum termsEnum = terms.iterator(); PostingsEnum docsEnum = null; final SuggestPayload spare = new SuggestPayload(); int maxAnalyzedPathsForOneInput = 0; final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP); int docCount = 0; while (true) { BytesRef term = termsEnum.next(); if (term == null) { break; } docsEnum = termsEnum.postings(null, docsEnum, PostingsEnum.PAYLOADS); builder.startTerm(term); int docFreq = 0; while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < docsEnum.freq(); i++) { final int position = docsEnum.nextPosition(); AnalyzingCompletionLookupProviderV1.this.parsePayload(docsEnum.getPayload(), spare); builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight); // multi fields have the same surface form so we sum up here maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1); } docFreq++; docCount = Math.max(docCount, docsEnum.docID() + 1); } builder.finishTerm(docFreq); } /* * Here we are done processing 
the field and we can * buid the FST and write it to disk. */ FST<Pair<Long, BytesRef>> build = builder.build(); assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; /* * it's possible that the FST is null if we have 2 segments that get merged * and all docs that have a value in this field are deleted. This will cause * a consumer to be created but it doesn't consume any values causing the FSTBuilder * to return null. */ if (build != null) { fieldOffsets.put(field, output.getFilePointer()); build.save(output); /* write some more meta-info */ output.writeVInt(maxAnalyzedPathsForOneInput); output.writeVInt(maxSurfaceFormsPerAnalyzedForm); output.writeInt(maxGraphExpansions); // can be negative int options = 0; options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0; options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; output.writeVInt(options); } } } }; }
From source file:org.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License:Apache License
@Override public FieldsConsumer consumer(final IndexOutput output) throws IOException { CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST); return new FieldsConsumer() { private Map<String, Long> fieldOffsets = new HashMap<>(); @Override/*w w w . ja v a2 s . co m*/ public void close() throws IOException { try { /* * write the offsets per field such that we know where * we need to load the FSTs from */ long pointer = output.getFilePointer(); output.writeVInt(fieldOffsets.size()); for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) { output.writeString(entry.getKey()); output.writeVLong(entry.getValue()); } output.writeLong(pointer); CodecUtil.writeFooter(output); } finally { IOUtils.close(output); } } @Override public void write(Fields fields) throws IOException { for (String field : fields) { Terms terms = fields.terms(field); if (terms == null) { continue; } TermsEnum termsEnum = terms.iterator(); PostingsEnum docsEnum = null; final SuggestPayload spare = new SuggestPayload(); int maxAnalyzedPathsForOneInput = 0; final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder( maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP); int docCount = 0; while (true) { BytesRef term = termsEnum.next(); if (term == null) { break; } docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS); builder.startTerm(term); int docFreq = 0; while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { for (int i = 0; i < docsEnum.freq(); i++) { final int position = docsEnum.nextPosition(); AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare); builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight); // multi fields have the same surface form so we sum up here maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1); } docFreq++; docCount = Math.max(docCount, docsEnum.docID() + 1); } builder.finishTerm(docFreq); } /* * Here we are done 
processing the field and we can * buid the FST and write it to disk. */ FST<Pair<Long, BytesRef>> build = builder.build(); assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; /* * it's possible that the FST is null if we have 2 segments that get merged * and all docs that have a value in this field are deleted. This will cause * a consumer to be created but it doesn't consume any values causing the FSTBuilder * to return null. */ if (build != null) { fieldOffsets.put(field, output.getFilePointer()); build.save(output); /* write some more meta-info */ output.writeVInt(maxAnalyzedPathsForOneInput); output.writeVInt(maxSurfaceFormsPerAnalyzedForm); output.writeInt(maxGraphExpansions); // can be negative int options = 0; options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0; options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0; options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0; output.writeVInt(options); output.writeVInt(XAnalyzingSuggester.SEP_LABEL); output.writeVInt(XAnalyzingSuggester.END_BYTE); output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP); output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER); } } } }; }