List of usage examples for org.apache.lucene.index IndexWriter MAX_TERM_LENGTH
int MAX_TERM_LENGTH
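MAX_TERM_LENGTH is the absolute hard maximum length of a single term, measured in bytes once the term is encoded as UTF-8; terms longer than this cannot be indexed, so the examples below use the constant to bound analyzer token lengths, size test data, and truncate field values. As a minimal sketch of the typical guard (assuming only Lucene core on the classpath; the class and method names here are illustrative, not taken from the examples below), a caller can compare a term's encoded length against the constant before handing it to IndexWriter:

import java.nio.charset.StandardCharsets;

import org.apache.lucene.index.IndexWriter;

// Hypothetical helper, not from the examples below: checks whether a term's
// UTF-8 encoding fits within IndexWriter.MAX_TERM_LENGTH bytes.
public class TermLengthCheck {

    public static boolean fitsInIndex(String term) {
        byte[] utf8 = term.getBytes(StandardCharsets.UTF_8);
        return utf8.length <= IndexWriter.MAX_TERM_LENGTH;
    }

    public static void main(String[] args) {
        // An ordinary token is far below the limit.
        System.out.println(fitsInIndex("lucene"));

        // A term one byte over the limit is not indexable.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i <= IndexWriter.MAX_TERM_LENGTH; i++) {
            sb.append('a');
        }
        System.out.println(fitsInIndex(sb.toString()));
    }
}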
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()

    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {

            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;

            final Thread mainThread = Thread.currentThread();

            if (field.equals("body")) {

                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:

                return new PostingsFormat(defaultPostingsFormat.getName()) {

                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }

                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }

                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }

                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }

                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());

        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }

        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}
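In the test above, MAX_TERM_LENGTH only bounds the analyzer's maximum token length; the rest of the method exercises the postings format's freedom to iterate the provided terms more than once at flush and merge time, then checks the gathered statistics against the final IndexReader.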
From source file:org.apache.nifi.provenance.lucene.LuceneUtil.java
License:Apache License
/**
 * Truncate a single field so that it does not exceed Lucene's byte size limit on indexed terms.
 *
 * @param field the string to be indexed
 * @return a string that can be indexed which is within Lucene's byte size limit, or null if anything goes wrong
 */
public static String truncateIndexField(String field) {
    if (field == null) {
        return field;
    }

    Charset charset = Charset.defaultCharset();
    byte[] bytes = field.getBytes(charset);
    if (bytes.length <= IndexWriter.MAX_TERM_LENGTH) {
        return field;
    }

    // chop the field to maximum allowed byte length
    ByteBuffer bbuf = ByteBuffer.wrap(bytes, 0, IndexWriter.MAX_TERM_LENGTH);

    try {
        // decode the chopped byte buffer back into original charset
        CharsetDecoder decoder = charset.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.reset();
        CharBuffer cbuf = decoder.decode(bbuf);
        return cbuf.toString();
    } catch (CharacterCodingException shouldNotHappen) {
    }

    // if we get here, something bad has happened
    return null;
}
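A note on the design: the truncation cuts at a raw byte boundary, which can split a multi-byte character, so the chopped buffer is decoded with CodingErrorAction.IGNORE to silently drop any trailing partial sequence rather than throw; the returned string therefore decodes cleanly and, for typical charsets, re-encodes to no more than IndexWriter.MAX_TERM_LENGTH bytes.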
From source file:org.elasticsearch.xpack.rollup.job.IndexerUtilsTests.java
License:Open Source License
public void testKeyOrderingNewIDLong() {
    CompositeAggregation composite = mock(CompositeAggregation.class);

    when(composite.getBuckets()).thenAnswer((Answer<List<CompositeAggregation.Bucket>>) invocationOnMock -> {
        List<CompositeAggregation.Bucket> foos = new ArrayList<>();

        CompositeAggregation.Bucket bucket = mock(CompositeAggregation.Bucket.class);
        LinkedHashMap<String, Object> keys = new LinkedHashMap<>(3);
        keys.put("foo.date_histogram", 123L);

        char[] charArray = new char[IndexWriter.MAX_TERM_LENGTH];
        Arrays.fill(charArray, 'a');
        keys.put("bar.terms", new String(charArray));

        keys.put("abc.histogram", 1.9);
        keys = shuffleMap(keys, Collections.emptySet());
        when(bucket.getKey()).thenReturn(keys);

        List<Aggregation> list = new ArrayList<>(3);
        InternalNumericMetricsAggregation.SingleValue mockAgg = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg.getName()).thenReturn("123");
        list.add(mockAgg);

        InternalNumericMetricsAggregation.SingleValue mockAgg2 = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg2.getName()).thenReturn("abc");
        list.add(mockAgg2);

        InternalNumericMetricsAggregation.SingleValue mockAgg3 = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg3.getName()).thenReturn("yay");
        list.add(mockAgg3);

        Collections.shuffle(list, random());

        Aggregations aggs = new Aggregations(list);
        when(bucket.getAggregations()).thenReturn(aggs);
        when(bucket.getDocCount()).thenReturn(1L);

        foos.add(bucket);

        return foos;
    });

    GroupConfig groupConfig = new GroupConfig(randomDateHistogramGroupConfig(random()),
            new HistogramGroupConfig(1, "abc"), null);

    List<IndexRequest> docs = IndexerUtils.processBuckets(composite, "foo", new RollupJobStats(), groupConfig,
            "foo", true);
    assertThat(docs.size(), equalTo(1));
    assertThat(docs.get(0).id(), equalTo("foo$VAFKZpyaEqYRPLyic57_qw"));
}
From source file:org.neo4j.kernel.api.impl.schema.LuceneDocumentStructureTest.java
License:Open Source License
@Test
public void tooLongArrayShouldBeSkipped() {
    byte[] bytes = RandomStringUtils.randomAscii(IndexWriter.MAX_TERM_LENGTH + 10).getBytes();

    Document document = LuceneDocumentStructure.documentRepresentingProperty(123, bytes);

    assertNull(document.getField(Array.key()));
}
From source file:org.neo4j.kernel.api.impl.schema.LuceneDocumentStructureTest.java
License:Open Source License
@Test
public void stringWithMaximumLengthShouldBeAllowed() {
    String longestString = RandomStringUtils.randomAscii(IndexWriter.MAX_TERM_LENGTH);

    Document document = LuceneDocumentStructure.documentRepresentingProperty(123, longestString);

    assertEquals(longestString, document.getField(String.key()).stringValue());
}