Example usage for org.apache.lucene.index IndexWriter MAX_TERM_LENGTH

Introduction

This page shows example usages of org.apache.lucene.index IndexWriter.MAX_TERM_LENGTH.

Prototype

public static final int MAX_TERM_LENGTH

Document

Absolute hard maximum length for a term, in bytes once encoded as UTF8.
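
In recent Lucene releases this limit is 32766 bytes (BYTE_BLOCK_SIZE - 2), and IndexWriter rejects any document containing a term whose UTF-8 encoding is longer. As a minimal sketch of guarding against the limit before indexing (the class and method names below are invented for illustration, not part of Lucene):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;

import org.apache.lucene.index.IndexWriter;

public class TermLengthCheck {

    // Returns true if the value fits within Lucene's hard term-length
    // limit once encoded as UTF-8; IndexWriter rejects documents whose
    // terms exceed IndexWriter.MAX_TERM_LENGTH bytes.
    public static boolean fitsTermLimit(String value) {
        return value.getBytes(StandardCharsets.UTF_8).length <= IndexWriter.MAX_TERM_LENGTH;
    }

    public static void main(String[] args) {
        System.out.println(fitsTermLimit("short term")); // true

        char[] oversized = new char[IndexWriter.MAX_TERM_LENGTH + 1];
        Arrays.fill(oversized, 'a');
        System.out.println(fitsTermLimit(new String(oversized))); // false
    }
}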

Usage

From source file: com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java

License: Apache License

@Override
public void testInvertedWrite() throws Exception {
    Directory dir = newDirectory();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

    // Must be concurrent because thread(s) can be merging
    // while up to one thread flushes, and each of those
    // threads iterates over the map while the flushing
    // thread might be adding to it:
    final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

    final AtomicLong sumDocFreq = new AtomicLong();
    final AtomicLong sumTotalTermFreq = new AtomicLong();

    // TODO: would be better to use / delegate to the current
    // Codec returned by getCodec()

    iwc.setCodec(new AssertingCodec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {

            PostingsFormat p = getCodec().postingsFormat();
            if (p instanceof PerFieldPostingsFormat) {
                p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            if (p instanceof RocanaPerFieldPostingsFormat) {
                p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
            }
            final PostingsFormat defaultPostingsFormat = p;

            final Thread mainThread = Thread.currentThread();

            if (field.equals("body")) {

                // A PF that counts up some stats and then in
                // the end we verify the stats match what the
                // final IndexReader says, just to exercise the
                // new freedom of iterating the postings more
                // than once at flush/merge:

                return new PostingsFormat(defaultPostingsFormat.getName()) {

                    @Override
                    public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

                        final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

                        return new FieldsConsumer() {
                            @Override
                            public void write(Fields fields) throws IOException {
                                fieldsConsumer.write(fields);

                                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                                // We only use one thread for flushing
                                // in this test:
                                assert isMerge || Thread.currentThread() == mainThread;

                                // We iterate the provided TermsEnum
                                // twice, so we exercise this new freedom
                                // with the inverted API; if
                                // addOnSecondPass is true, we add up
                                // term stats on the 2nd iteration:
                                boolean addOnSecondPass = random().nextBoolean();

                                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                                // Gather our own stats:
                                Terms terms = fields.terms("body");
                                assert terms != null;

                                TermsEnum termsEnum = terms.iterator();
                                PostingsEnum docs = null;
                                while (termsEnum.next() != null) {
                                    BytesRef term = termsEnum.term();
                                    // TODO: also sometimes ask for payloads/offsets?
                                    boolean noPositions = random().nextBoolean();
                                    if (noPositions) {
                                        docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                    } else {
                                        docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                    }
                                    int docFreq = 0;
                                    long totalTermFreq = 0;
                                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                        docFreq++;
                                        totalTermFreq += docs.freq();
                                        int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                        if (!noPositions) {
                                            for (int i = 0; i < limit; i++) {
                                                docs.nextPosition();
                                            }
                                        }
                                    }

                                    String termString = term.utf8ToString();

                                    // During merge we should only see terms
                                    // we had already seen during a
                                    // previous flush:
                                    assertTrue(isMerge == false || termFreqs.containsKey(termString));

                                    if (isMerge == false) {
                                        if (addOnSecondPass == false) {
                                            TermFreqs tf = termFreqs.get(termString);
                                            if (tf == null) {
                                                tf = new TermFreqs();
                                                termFreqs.put(termString, tf);
                                            }
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        } else if (termFreqs.containsKey(termString) == false) {
                                            // Add placeholder (2nd pass will
                                            // set its counts):
                                            termFreqs.put(termString, new TermFreqs());
                                        }
                                    }
                                }

                                // Also test seeking the TermsEnum:
                                for (String term : termFreqs.keySet()) {
                                    if (termsEnum.seekExact(new BytesRef(term))) {
                                        // TODO: also sometimes ask for payloads/offsets?
                                        boolean noPositions = random().nextBoolean();
                                        if (noPositions) {
                                            docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                                        } else {
                                            docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                                        }

                                        int docFreq = 0;
                                        long totalTermFreq = 0;
                                        while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                                            docFreq++;
                                            totalTermFreq += docs.freq();
                                            int limit = TestUtil.nextInt(random(), 1, docs.freq());
                                            if (!noPositions) {
                                                for (int i = 0; i < limit; i++) {
                                                    docs.nextPosition();
                                                }
                                            }
                                        }

                                        if (isMerge == false && addOnSecondPass) {
                                            TermFreqs tf = termFreqs.get(term);
                                            assert tf != null;
                                            tf.docFreq += docFreq;
                                            tf.totalTermFreq += totalTermFreq;
                                            sumDocFreq.addAndGet(docFreq);
                                            sumTotalTermFreq.addAndGet(totalTermFreq);
                                        }

                                        //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                                        assertTrue(docFreq <= termFreqs.get(term).docFreq);
                                        assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                                    }
                                }

                                // Also test seekCeil
                                for (int iter = 0; iter < 10; iter++) {
                                    BytesRef term = new BytesRef(
                                            TestUtil.randomRealisticUnicodeString(random()));
                                    SeekStatus status = termsEnum.seekCeil(term);
                                    if (status == SeekStatus.NOT_FOUND) {
                                        assertTrue(term.compareTo(termsEnum.term()) < 0);
                                    }
                                }
                            }

                            @Override
                            public void close() throws IOException {
                                fieldsConsumer.close();
                            }
                        };
                    }

                    @Override
                    public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
                        return defaultPostingsFormat.fieldsProducer(state);
                    }
                };
            } else {
                return defaultPostingsFormat;
            }
        }
    });

    RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);

    LineFileDocs docs = new LineFileDocs(random());
    int bytesToIndex = atLeast(100) * 1024;
    int bytesIndexed = 0;
    while (bytesIndexed < bytesToIndex) {
        Document doc = docs.nextDoc();
        w.addDocument(doc);
        bytesIndexed += RamUsageTester.sizeOf(doc);
    }

    IndexReader r = w.getReader();
    w.close();

    Terms terms = MultiFields.getTerms(r, "body");
    assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
    assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

    TermsEnum termsEnum = terms.iterator();
    long termCount = 0;
    boolean supportsOrds = true;
    while (termsEnum.next() != null) {
        BytesRef term = termsEnum.term();
        assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
        assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
        if (supportsOrds) {
            long ord;
            try {
                ord = termsEnum.ord();
            } catch (UnsupportedOperationException uoe) {
                supportsOrds = false;
                ord = -1;
            }
            if (ord != -1) {
                assertEquals(termCount, ord);
            }
        }
        termCount++;
    }
    assertEquals(termFreqs.size(), termCount);

    r.close();
    dir.close();
}

From source file: org.apache.nifi.provenance.lucene.LuceneUtil.java

License: Apache License

/**
 * Truncate a single field so that it does not exceed Lucene's byte size limit on indexed terms.
 *
 * @param field the string to be indexed
 * @return a string that can be indexed which is within Lucene's byte size limit, or null if anything goes wrong
 */
public static String truncateIndexField(String field) {
    if (field == null) {
        return field;
    }

    Charset charset = Charset.defaultCharset();
    byte[] bytes = field.getBytes(charset);
    if (bytes.length <= IndexWriter.MAX_TERM_LENGTH) {
        return field;
    }

    // chop the field to maximum allowed byte length
    ByteBuffer bbuf = ByteBuffer.wrap(bytes, 0, IndexWriter.MAX_TERM_LENGTH);

    try {
        // decode the chopped byte buffer back into original charset
        CharsetDecoder decoder = charset.newDecoder();
        decoder.onMalformedInput(CodingErrorAction.IGNORE);
        decoder.reset();
        CharBuffer cbuf = decoder.decode(bbuf);
        return cbuf.toString();
    } catch (CharacterCodingException shouldNotHappen) {
    }

    // if we get here, something bad has happened
    return null;
}
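
For illustration, a hypothetical caller (the variable names here are invented) might apply this helper before indexing:

    // Values at or under the limit pass through unchanged; longer values
    // come back truncated to at most IndexWriter.MAX_TERM_LENGTH bytes.
    char[] chars = new char[IndexWriter.MAX_TERM_LENGTH + 100];
    Arrays.fill(chars, 'a');
    String oversized = new String(chars);
    String indexable = LuceneUtil.truncateIndexField(oversized);
    assert indexable.getBytes(Charset.defaultCharset()).length <= IndexWriter.MAX_TERM_LENGTH;

Note that the method counts bytes in the platform default charset; for ASCII input this matches the UTF-8-based limit exactly.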

From source file: org.elasticsearch.xpack.rollup.job.IndexerUtilsTests.java

License: Open Source License

public void testKeyOrderingNewIDLong() {
    CompositeAggregation composite = mock(CompositeAggregation.class);

    when(composite.getBuckets()).thenAnswer((Answer<List<CompositeAggregation.Bucket>>) invocationOnMock -> {
        List<CompositeAggregation.Bucket> foos = new ArrayList<>();

        CompositeAggregation.Bucket bucket = mock(CompositeAggregation.Bucket.class);
        LinkedHashMap<String, Object> keys = new LinkedHashMap<>(3);
        keys.put("foo.date_histogram", 123L);

        char[] charArray = new char[IndexWriter.MAX_TERM_LENGTH];
        Arrays.fill(charArray, 'a');
        keys.put("bar.terms", new String(charArray));
        keys.put("abc.histogram", 1.9);
        keys = shuffleMap(keys, Collections.emptySet());
        when(bucket.getKey()).thenReturn(keys);

        List<Aggregation> list = new ArrayList<>(3);
        InternalNumericMetricsAggregation.SingleValue mockAgg = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg.getName()).thenReturn("123");
        list.add(mockAgg);

        InternalNumericMetricsAggregation.SingleValue mockAgg2 = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg2.getName()).thenReturn("abc");
        list.add(mockAgg2);

        InternalNumericMetricsAggregation.SingleValue mockAgg3 = mock(
                InternalNumericMetricsAggregation.SingleValue.class);
        when(mockAgg3.getName()).thenReturn("yay");
        list.add(mockAgg3);

        Collections.shuffle(list, random());

        Aggregations aggs = new Aggregations(list);
        when(bucket.getAggregations()).thenReturn(aggs);
        when(bucket.getDocCount()).thenReturn(1L);

        foos.add(bucket);

        return foos;
    });

    GroupConfig groupConfig = new GroupConfig(randomDateHistogramGroupConfig(random()),
            new HistogramGroupConfig(1, "abc"), null);
    List<IndexRequest> docs = IndexerUtils.processBuckets(composite, "foo", new RollupJobStats(), groupConfig,
            "foo", true);
    assertThat(docs.size(), equalTo(1));
    assertThat(docs.get(0).id(), equalTo("foo$VAFKZpyaEqYRPLyic57_qw"));
}

From source file: org.neo4j.kernel.api.impl.schema.LuceneDocumentStructureTest.java

License: Open Source License

@Test
public void tooLongArrayShouldBeSkipped() {
    byte[] bytes = RandomStringUtils.randomAscii(IndexWriter.MAX_TERM_LENGTH + 10).getBytes();
    Document document = LuceneDocumentStructure.documentRepresentingProperty(123, bytes);
    assertNull(document.getField(Array.key()));
}

From source file: org.neo4j.kernel.api.impl.schema.LuceneDocumentStructureTest.java

License: Open Source License

@Test
public void stringWithMaximumLengthShouldBeAllowed() {
    String longestString = RandomStringUtils.randomAscii(IndexWriter.MAX_TERM_LENGTH);
    Document document = LuceneDocumentStructure.documentRepresentingProperty(123, longestString);
    assertEquals(longestString, document.getField(String.key()).stringValue());
}