List of usage examples for org.apache.lucene.index LeafReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index. For performance reasons this method does not check whether the requested document has been deleted, so callers that care should first consult the reader's live docs.
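Before the source-file examples, a minimal self-contained sketch of calling document(int) on every leaf of an index. The index path and the "title" field are placeholder assumptions, not part of any example below:

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.FSDirectory;

public class LeafReaderDocumentExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder for an existing Lucene index directory.
        try (DirectoryReader topReader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader leaf = ctx.reader();
                for (int docID = 0; docID < leaf.maxDoc(); docID++) {
                    // Loads the stored fields of the docID-th document in this segment.
                    // Deleted documents are NOT skipped; consult leaf.getLiveDocs() if that matters.
                    Document doc = leaf.document(docID);
                    System.out.println(doc.get("title")); // "title" is a hypothetical stored field
                }
            }
        }
    }
}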
From source file:com.meizu.nlp.classification.utils.DatasetSplitter.java
License:Apache License
/**
 * Split a given index into 3 indexes for training, test and cross validation tasks respectively
 *
 * @param originalIndex an {@link org.apache.lucene.index.LeafReader} on the source index
 * @param trainingIndex a {@link Directory} used to write the training index
 * @param testIndex a {@link Directory} used to write the test index
 * @param crossValidationIndex a {@link Directory} used to write the cross validation index
 * @param analyzer {@link Analyzer} used to create the new docs
 * @param fieldNames names of fields that need to be put in the new indexes or <code>null</code> if all should be used
 * @throws IOException if any writing operation fails on any of the indexes
 */
public void split(LeafReader originalIndex, Directory trainingIndex, Directory testIndex,
        Directory crossValidationIndex, Analyzer analyzer, String... fieldNames) throws IOException {

    // create IWs for train / test / cv IDXs
    IndexWriter testWriter = new IndexWriter(testIndex, new IndexWriterConfig(analyzer));
    IndexWriter cvWriter = new IndexWriter(crossValidationIndex, new IndexWriterConfig(analyzer));
    IndexWriter trainingWriter = new IndexWriter(trainingIndex, new IndexWriterConfig(analyzer));

    try {
        int size = originalIndex.maxDoc();

        IndexSearcher indexSearcher = new IndexSearcher(originalIndex);
        TopDocs topDocs = indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE);

        // set the type to be indexed, stored, with term vectors
        FieldType ft = new FieldType(TextField.TYPE_STORED);
        ft.setStoreTermVectors(true);
        ft.setStoreTermVectorOffsets(true);
        ft.setStoreTermVectorPositions(true);

        int b = 0;

        // iterate over existing documents
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {

            // create a new document for indexing
            Document doc = new Document();
            if (fieldNames != null && fieldNames.length > 0) {
                for (String fieldName : fieldNames) {
                    doc.add(new Field(fieldName,
                            originalIndex.document(scoreDoc.doc).getField(fieldName).stringValue(), ft));
                }
            } else {
                for (IndexableField storableField : originalIndex.document(scoreDoc.doc).getFields()) {
                    if (storableField.readerValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.readerValue(), ft));
                    } else if (storableField.binaryValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.binaryValue(), ft));
                    } else if (storableField.stringValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.stringValue(), ft));
                    } else if (storableField.numericValue() != null) {
                        doc.add(new Field(storableField.name(), storableField.numericValue().toString(), ft));
                    }
                }
            }

            // add it to one of the IDXs
            if (b % 2 == 0 && testWriter.maxDoc() < size * testRatio) {
                testWriter.addDocument(doc);
            } else if (cvWriter.maxDoc() < size * crossValidationRatio) {
                cvWriter.addDocument(doc);
            } else {
                trainingWriter.addDocument(doc);
            }
            b++;
        }
    } catch (Exception e) {
        throw new IOException(e);
    } finally {
        // flush and close all three writers even if the copy loop failed
        testWriter.commit();
        cvWriter.commit();
        trainingWriter.commit();
        testWriter.close();
        cvWriter.close();
        trainingWriter.close();
    }
}
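A hedged usage sketch for the method above. Assumptions not shown in the source: DatasetSplitter exposes a (testRatio, crossValidationRatio) constructor as in Lucene's classification module, SlowCompositeReaderWrapper is available to flatten a composite reader (org.apache.lucene.index through Lucene 5.x), and all paths and field names are placeholders:

import java.io.IOException;
import java.nio.file.Paths;
import com.meizu.nlp.classification.utils.DatasetSplitter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.SlowCompositeReaderWrapper;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SplitExample {
    public static void main(String[] args) throws IOException {
        // Assumption: DatasetSplitter(testRatio, crossValidationRatio), mirroring Lucene's classification module.
        DatasetSplitter splitter = new DatasetSplitter(0.1d, 0.1d);

        Directory source = FSDirectory.open(Paths.get("/path/to/source-index")); // placeholder path
        try (DirectoryReader topReader = DirectoryReader.open(source)) {
            // split() takes a single LeafReader, so flatten the composite reader first
            LeafReader leaf = SlowCompositeReaderWrapper.wrap(topReader);
            splitter.split(leaf,
                    FSDirectory.open(Paths.get("/path/to/train")), // placeholder paths
                    FSDirectory.open(Paths.get("/path/to/test")),
                    FSDirectory.open(Paths.get("/path/to/cv")),
                    new StandardAnalyzer(),
                    "body", "title"); // hypothetical field names; pass none to copy all fields
        }
    }
}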
From source file:org.apache.solr.handler.component.AlfrescoLukeRequestHandler.java
License:Open Source License
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.next() == null) {
        // Ran off the end of the terms enum without finding any live docs with that field in them.
        return null;
    }
    PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    // Bail out if the first term has no postings, or if its first document has been
    // deleted (liveDocs bits are set for live documents, hence the negation).
    if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
            || (liveDocs != null && !liveDocs.get(postingsEnum.docID()))) {
        return null;
    }
    return reader.document(postingsEnum.docID());
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testIndexAndAddDocValues() throws Exception {
    Random rand = random();

    for (int i = 0; i < 100; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    // Assert everything has been indexed and there are no docvalues
    withNewRawReader(h, topReader -> {
        assertEquals(100, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should not have docValues yet
        assertEquals(DocValuesType.NONE, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    addDocValuesTo(h, TEST_FIELD);

    // Add some more documents with doc values turned on, including updating some
    for (int i = 90; i < 110; i++) {
        assertU(adoc(ID_FIELD, String.valueOf(i), TEST_FIELD, String.valueOf(i)));
        if (rand.nextBoolean()) {
            assertU(commit());
        }
    }
    assertU(commit());

    withNewRawReader(h, topReader -> {
        assertEquals(110, topReader.numDocs());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());
    });

    int optimizeSegments = 1;
    assertU(optimize("maxSegments", String.valueOf(optimizeSegments)));

    // Assert all docs have the right docvalues
    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(110, topReader.numDocs());
        assertEquals(optimizeSegments, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        // Check that all segments have the right docvalues type with the correct value.
        // Also check that other fields (e.g. the id field) didn't mistakenly get docvalues added.
        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());
                assertEquals(v, id);

                docvalues.nextDoc();
                assertEquals(v, docvalues.binaryValue().utf8ToString());
            }
        }
    });
}
From source file:org.apache.solr.index.UninvertDocValuesMergePolicyTest.java
License:Apache License
public void testNonIndexedFieldDoesNonFail() throws Exception {
    // Remove Indexed from fieldType
    removeIndexFrom(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(1), TEST_FIELD, String.valueOf(1)));
    assertU(commit());

    addDocValuesTo(h, TEST_FIELD);

    assertU(adoc(ID_FIELD, String.valueOf(2), TEST_FIELD, String.valueOf(2)));
    assertU(commit());

    assertU(optimize("maxSegments", "1"));

    withNewRawReader(h, topReader -> {
        // Assert merged into one segment
        assertEquals(2, topReader.numDocs());
        assertEquals(1, topReader.leaves().size());

        final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);

        // The global field type should have docValues because a document with dvs was added
        assertEquals(DocValuesType.SORTED, infos.fieldInfo(TEST_FIELD).getDocValuesType());

        for (LeafReaderContext ctx : topReader.leaves()) {
            LeafReader r = ctx.reader();
            SortedDocValues docvalues = r.getSortedDocValues(TEST_FIELD);
            for (int i = 0; i < r.numDocs(); ++i) {
                Document doc = r.document(i);
                String v = doc.getField(TEST_FIELD).stringValue();
                String id = doc.getField(ID_FIELD).stringValue();
                assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(TEST_FIELD).getDocValuesType());
                assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(ID_FIELD).getDocValuesType());

                if (id.equals("2")) {
                    assertTrue(docvalues.advanceExact(i));
                    assertEquals(v, docvalues.binaryValue().utf8ToString());
                } else {
                    assertFalse(docvalues.advanceExact(i));
                }
            }
        }
    });
}
From source file:org.apache.solr.schema.TestHalfAndHalfDocValues.java
License:Apache License
public void testHalfAndHalfDocValues() throws Exception {
    // Insert two docs without docvalues
    String fieldname = "string_add_dv_later";
    assertU(adoc("id", "3", fieldname, "c"));
    assertU(commit());
    assertU(adoc("id", "1", fieldname, "a"));
    assertU(commit());

    try (SolrCore core = h.getCoreInc()) {
        assertFalse(core.getLatestSchema().getField(fieldname).hasDocValues());

        // Add docvalues to the field type
        IndexSchema schema = core.getLatestSchema();
        SchemaField oldField = schema.getField(fieldname);
        int newProperties = oldField.getProperties() | SchemaField.DOC_VALUES;

        SchemaField sf = new SchemaField(fieldname, oldField.getType(), newProperties, null);
        schema.getFields().put(fieldname, sf);

        // Insert a new doc with docvalues
        assertU(adoc("id", "2", fieldname, "b"));
        assertU(commit());

        // Check there are a mix of segments with and without docvalues
        final RefCounted<SolrIndexSearcher> searcherRef = core.openNewSearcher(true, true);
        final SolrIndexSearcher searcher = searcherRef.get();
        try {
            final DirectoryReader topReader = searcher.getRawReader();

            // Assert no merges
            assertEquals(3, topReader.numDocs());
            assertEquals(3, topReader.leaves().size());

            final FieldInfos infos = MultiFields.getMergedFieldInfos(topReader);
            // The global field type should have docValues because a document with dvs was added
            assertEquals(DocValuesType.SORTED, infos.fieldInfo(fieldname).getDocValuesType());

            for (LeafReaderContext ctx : topReader.leaves()) {
                LeafReader r = ctx.reader();
                // Make sure there were no merges
                assertEquals(1, r.numDocs());
                Document doc = r.document(0);
                String id = doc.getField("id").stringValue();
                if (id.equals("1") || id.equals("3")) {
                    assertEquals(DocValuesType.NONE, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                } else {
                    assertEquals(DocValuesType.SORTED, r.getFieldInfos().fieldInfo(fieldname).getDocValuesType());
                }
            }
        } finally {
            searcherRef.decref();
        }
    }

    // Assert sort order is correct
    assertQ(req("q", "string_add_dv_later:*", "sort", "string_add_dv_later asc"), "//*[@numFound='3']",
            "//result/doc[1]/int[@name='id'][.=1]",
            "//result/doc[2]/int[@name='id'][.=2]",
            "//result/doc[3]/int[@name='id'][.=3]");
}
From source file:org.apache.solr.schema.TestPointFields.java
License:Apache License
private void doTestInternals(String field, String[] values) throws IOException {
    assertTrue(h.getCore().getLatestSchema().getField(field).getType() instanceof PointField);
    for (int i = 0; i < 10; i++) {
        assertU(adoc("id", String.valueOf(i), field, values[i]));
    }
    assertU(commit());

    IndexReader ir;
    RefCounted<SolrIndexSearcher> ref = null;
    SchemaField sf = h.getCore().getLatestSchema().getField(field);
    boolean ignoredField = !(sf.indexed() || sf.stored() || sf.hasDocValues());
    try {
        ref = h.getCore().getSearcher();
        SolrIndexSearcher searcher = ref.get();
        ir = searcher.getIndexReader();
        // our own SlowCompositeReader to check DocValues on disk w/o the UninvertingReader added by SolrIndexSearcher
        final LeafReader leafReaderForCheckingDVs = SlowCompositeReaderWrapper.wrap(searcher.getRawReader());

        if (sf.indexed()) {
            assertEquals("Field " + field + " should have point values", 10, PointValues.size(ir, field));
        } else {
            assertEquals("Field " + field + " should have no point values", 0, PointValues.size(ir, field));
        }

        if (ignoredField) {
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getSorted(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
            assertTrue("Field " + field + " should not have docValues",
                    DocValues.getBinary(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
        } else {
            if (sf.hasDocValues()) {
                if (sf.multiValued()) {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getSortedNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                } else {
                    assertFalse("Field " + field + " should have docValues",
                            DocValues.getNumeric(leafReaderForCheckingDVs, field).nextDoc() == DocIdSetIterator.NO_MORE_DOCS);
                }
            } else {
                expectThrows(IllegalStateException.class,
                        () -> DocValues.getSortedNumeric(leafReaderForCheckingDVs, field));
                expectThrows(IllegalStateException.class,
                        () -> DocValues.getNumeric(leafReaderForCheckingDVs, field));
            }
            expectThrows(IllegalStateException.class, () -> DocValues.getSorted(leafReaderForCheckingDVs, field));
            expectThrows(IllegalStateException.class, () -> DocValues.getBinary(leafReaderForCheckingDVs, field));
        }

        for (LeafReaderContext leaf : ir.leaves()) {
            LeafReader reader = leaf.reader();
            for (int i = 0; i < reader.numDocs(); i++) {
                Document doc = reader.document(i);
                if (sf.stored()) {
                    assertNotNull("Field " + field + " not found. Doc: " + doc, doc.get(field));
                } else {
                    assertNull(doc.get(field));
                }
            }
        }
    } finally {
        ref.decref();
    }
    clearIndex();
    assertU(commit());
}
From source file:org.apache.solr.uninverting.TestFieldCacheVsDocValues.java
License:Apache License
public void testHugeBinaryValues() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    // FSDirectory because SimpleText will consume gobs of
    // space when storing big binary values:
    Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
    boolean doFixed = random().nextBoolean();
    int numDocs;
    int fixedLength = 0;
    if (doFixed) {
        // Sometimes make all values fixed length since some
        // codecs have different code paths for this:
        numDocs = TestUtil.nextInt(random(), 10, 20);
        fixedLength = TestUtil.nextInt(random(), 65537, 256 * 1024);
    } else {
        numDocs = TestUtil.nextInt(random(), 100, 200);
    }
    IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));

    List<byte[]> docBytes = new ArrayList<>();
    long totalBytes = 0;
    for (int docID = 0; docID < numDocs; docID++) {
        // we don't use RandomIndexWriter because it might add
        // more docvalues than we expect !!!!

        // Must be > 64KB in size to ensure more than 2 pages in
        // PagedBytes would be needed:
        int numBytes;
        if (doFixed) {
            numBytes = fixedLength;
        } else if (docID == 0 || random().nextInt(5) == 3) {
            numBytes = TestUtil.nextInt(random(), 65537, 3 * 1024 * 1024);
        } else {
            numBytes = TestUtil.nextInt(random(), 1, 1024 * 1024);
        }
        totalBytes += numBytes;
        if (totalBytes > 5 * 1024 * 1024) {
            break;
        }

        byte[] bytes = new byte[numBytes];
        random().nextBytes(bytes);
        docBytes.add(bytes);
        Document doc = new Document();
        BytesRef b = new BytesRef(bytes);
        b.length = bytes.length;
        doc.add(new BinaryDocValuesField("field", b));
        doc.add(new StringField("id", "" + docID, Field.Store.YES));
        try {
            w.addDocument(doc);
        } catch (IllegalArgumentException iae) {
            if (iae.getMessage().indexOf("is too large") == -1) {
                throw iae;
            } else {
                // OK: some codecs can't handle binary DV > 32K
                assertFalse(codecAcceptsHugeBinaryValues("field"));
                w.rollback();
                d.close();
                return;
            }
        }
    }

    DirectoryReader r;
    try {
        r = DirectoryReader.open(w);
    } catch (IllegalArgumentException iae) {
        if (iae.getMessage().indexOf("is too large") == -1) {
            throw iae;
        } else {
            assertFalse(codecAcceptsHugeBinaryValues("field"));
            // OK: some codecs can't handle binary DV > 32K
            w.rollback();
            d.close();
            return;
        }
    }
    w.close();

    LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
    TestUtil.checkReader(ar);

    BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
    for (int docID = 0; docID < docBytes.size(); docID++) {
        Document doc = ar.document(docID);
        assertEquals(docID, s.nextDoc());
        BytesRef bytes = s.binaryValue();
        byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
        assertEquals(expected.length, bytes.length);
        assertEquals(new BytesRef(expected), bytes);
    }

    assertTrue(codecAcceptsHugeBinaryValues("field"));

    ar.close();
    d.close();
}
From source file:org.apache.solr.uninverting.TestFieldCacheVsDocValues.java
License:Apache License
public void testHugeBinaryValueLimit() throws Exception {
    // We only test DVFormats that have a limit
    assumeFalse("test requires codec with limits on max binary field length",
            codecAcceptsHugeBinaryValues("field"));
    Analyzer analyzer = new MockAnalyzer(random());
    // FSDirectory because SimpleText will consume gobs of
    // space when storing big binary values:
    Directory d = newFSDirectory(createTempDir("hugeBinaryValues"));
    boolean doFixed = random().nextBoolean();
    int numDocs;
    int fixedLength = 0;
    if (doFixed) {
        // Sometimes make all values fixed length since some
        // codecs have different code paths for this:
        numDocs = TestUtil.nextInt(random(), 10, 20);
        fixedLength = LARGE_BINARY_FIELD_LENGTH;
    } else {
        numDocs = TestUtil.nextInt(random(), 100, 200);
    }
    IndexWriter w = new IndexWriter(d, newIndexWriterConfig(analyzer));

    List<byte[]> docBytes = new ArrayList<>();
    long totalBytes = 0;
    for (int docID = 0; docID < numDocs; docID++) {
        // we don't use RandomIndexWriter because it might add
        // more docvalues than we expect !!!!

        // Must be > 64KB in size to ensure more than 2 pages in
        // PagedBytes would be needed:
        int numBytes;
        if (doFixed) {
            numBytes = fixedLength;
        } else if (docID == 0 || random().nextInt(5) == 3) {
            numBytes = LARGE_BINARY_FIELD_LENGTH;
        } else {
            numBytes = TestUtil.nextInt(random(), 1, LARGE_BINARY_FIELD_LENGTH);
        }
        totalBytes += numBytes;
        if (totalBytes > 5 * 1024 * 1024) {
            break;
        }

        byte[] bytes = new byte[numBytes];
        random().nextBytes(bytes);
        docBytes.add(bytes);
        Document doc = new Document();
        BytesRef b = new BytesRef(bytes);
        b.length = bytes.length;
        doc.add(new BinaryDocValuesField("field", b));
        doc.add(new StringField("id", "" + docID, Field.Store.YES));
        w.addDocument(doc);
    }

    DirectoryReader r = DirectoryReader.open(w);
    w.close();

    LeafReader ar = SlowCompositeReaderWrapper.wrap(r);
    TestUtil.checkReader(ar);

    BinaryDocValues s = FieldCache.DEFAULT.getTerms(ar, "field");
    for (int docID = 0; docID < docBytes.size(); docID++) {
        assertEquals(docID, s.nextDoc());
        Document doc = ar.document(docID);
        BytesRef bytes = s.binaryValue();
        byte[] expected = docBytes.get(Integer.parseInt(doc.get("id")));
        assertEquals(expected.length, bytes.length);
        assertEquals(new BytesRef(expected), bytes);
    }

    ar.close();
    d.close();
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test filtering an index with no fields
 */
public void testEmpty() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);
    iw.addDocument(new Document());

    // open reader
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // see no fields
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Set<String> seenFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        seenFields.add(info.name);
    }
    assertEquals(0, seenFields.size());
    assertNull(segmentReader.terms("foo"));

    // see no vectors
    assertNull(segmentReader.getTermVectors(0));

    // see no stored fields
    Document document = segmentReader.document(0);
    assertEquals(0, document.getFields().size());

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.tallison.lucene.search.concordance.TestSimpleAnalyzerUtil.java
License:Apache License
private void executeNeedleTests(Analyzer analyzer) throws Exception {
    String needle = getNeedle(analyzer);
    int numFieldValues = 23;

    Directory directory = buildNeedleIndex(needle, analyzer, numFieldValues);

    IndexReader reader = DirectoryReader.open(directory);

    LeafReaderContext ctx = reader.leaves().get(0);
    LeafReader r = ctx.reader();

    PostingsEnum dpe = r.postings(new Term(FIELD, needle), PostingsEnum.ALL);
    int numTests = 0;
    try {
        while (dpe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            int frq = dpe.freq();
            int advanced = 0;

            String[] fieldValues = r.document(dpe.docID()).getValues(FIELD);
            while (++advanced < frq) {
                dpe.nextPosition();
                String rebuilt = SimpleAnalyzerUtil.substringFromMultiValuedFields(dpe.startOffset(),
                        dpe.endOffset(), fieldValues, analyzer.getOffsetGap(FIELD), " | ");
                assertEquals(needle, rebuilt);
                numTests++;
            }
        }
    } finally {
        reader.close();
        directory.close();
    }
    assertEquals("number of tests", numFieldValues - 1, numTests);
}