List of usage examples for org.apache.lucene.index.LeafReader.terms(String)
public abstract Terms terms(String field) throws IOException;
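Before the full examples below, a minimal, self-contained sketch of the typical call pattern (the index path taken from args[0] and the field name "body" are assumptions for illustration): terms(field) may return null when a segment has no postings for that field, so callers check for null before iterating.

// A minimal sketch (hypothetical index path from args[0], assumed field name "body").
// LeafReader.terms(field) returns null when a segment has no postings for that field,
// so check for null before iterating.
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class LeafReaderTermsExample {
    public static void main(String[] args) throws IOException {
        try (Directory dir = FSDirectory.open(Paths.get(args[0]));
                DirectoryReader reader = DirectoryReader.open(dir)) {
            // a DirectoryReader is composed of leaves; terms() is a per-segment (LeafReader) API
            for (LeafReaderContext leaf : reader.leaves()) {
                Terms terms = leaf.reader().terms("body"); // may be null in this segment
                if (terms == null) {
                    continue;
                }
                TermsEnum termsEnum = terms.iterator();
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
                }
            }
        }
    }
}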
From source file:org.elasticsearch.index.shard.ShardSplittingQuery.java
License:Apache License
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) {
    return new ConstantScoreWeight(this, boost) {
        @Override
        public String toString() {
            return "weight(delete docs query)";
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            LeafReader leafReader = context.reader();
            FixedBitSet bitSet = new FixedBitSet(leafReader.maxDoc());
            Terms terms = leafReader.terms(RoutingFieldMapper.NAME);
            Predicate<BytesRef> includeInShard = ref -> {
                int targetShardId = OperationRouting.generateShardId(indexMetaData,
                        Uid.decodeId(ref.bytes, ref.offset, ref.length), null);
                return shardId == targetShardId;
            };
            if (terms == null) {
                // this is the common case - no partitioning and no _routing values
                // in this case we also don't do anything special with regards to nested docs since we basically delete
                // by ID, and parent and nested docs all have the same id.
                assert indexMetaData.isRoutingPartitionedIndex() == false;
                findSplitDocs(IdFieldMapper.NAME, includeInShard, leafReader, bitSet::set);
            } else {
                final BitSet parentBitSet;
                if (nestedParentBitSetProducer == null) {
                    parentBitSet = null;
                } else {
                    parentBitSet = nestedParentBitSetProducer.getBitSet(context);
                    if (parentBitSet == null) {
                        return null; // no matches
                    }
                }
                if (indexMetaData.isRoutingPartitionedIndex()) {
                    // this is the heaviest case. Here we have to visit all docs' stored fields to extract
                    // _id and _routing since this index is routing partitioned.
                    Visitor visitor = new Visitor(leafReader);
                    TwoPhaseIterator twoPhaseIterator = parentBitSet == null
                            ? new RoutingPartitionedDocIdSetIterator(visitor)
                            : new NestedRoutingPartitionedDocIdSetIterator(visitor, parentBitSet);
                    return new ConstantScoreScorer(this, score(), twoPhaseIterator);
                } else {
                    // here we potentially guard the docID consumers with our parent bitset if we have one.
                    // this ensures that we are only marking root documents in the nested case and, if necessary,
                    // we do a second pass to mark the corresponding children in markChildDocs
                    Function<IntConsumer, IntConsumer> maybeWrapConsumer = consumer -> {
                        if (parentBitSet != null) {
                            return docId -> {
                                if (parentBitSet.get(docId)) {
                                    consumer.accept(docId);
                                }
                            };
                        }
                        return consumer;
                    };
                    // in the _routing case we first go and find all docs that have a routing value and mark the ones we have to delete
                    findSplitDocs(RoutingFieldMapper.NAME, ref -> {
                        int targetShardId = OperationRouting.generateShardId(indexMetaData, null, ref.utf8ToString());
                        return shardId == targetShardId;
                    }, leafReader, maybeWrapConsumer.apply(bitSet::set));

                    // now if we have a mixed index where some docs have a _routing value and some don't, we have to exclude the ones
                    // with a routing value from the next iteration and delete / select based on the ID.
                    if (terms.getDocCount() != leafReader.maxDoc()) {
                        // this is a special case where some of the docs have no routing values; unfortunately that's still possible today
                        FixedBitSet hasRoutingValue = new FixedBitSet(leafReader.maxDoc());
                        findSplitDocs(RoutingFieldMapper.NAME, ref -> false, leafReader,
                                maybeWrapConsumer.apply(hasRoutingValue::set));
                        IntConsumer bitSetConsumer = maybeWrapConsumer.apply(bitSet::set);
                        findSplitDocs(IdFieldMapper.NAME, includeInShard, leafReader, docId -> {
                            if (hasRoutingValue.get(docId) == false) {
                                bitSetConsumer.accept(docId);
                            }
                        });
                    }
                }
                if (parentBitSet != null) {
                    // if nested docs are involved we also need to mark all child docs that belong to a matching parent doc.
                    markChildDocs(parentBitSet, bitSet);
                }
            }
            return new ConstantScoreScorer(this, score(), new BitSetIterator(bitSet, bitSet.length()));
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
            // This is not a regular query, let's not cache it. It wouldn't help anyway.
            return false;
        }
    };
}
From source file:org.elasticsearch.index.shard.ShardSplittingQuery.java
License:Apache License
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
        IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test filtering two string fields
 */
public void testIndexed() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Set<String> seenFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        seenFields.add(info.name);
    }
    assertEquals(Collections.singleton("fieldA"), seenFields);
    assertNotNull(segmentReader.terms("fieldA"));
    assertNull(segmentReader.terms("fieldB"));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test special handling for _field_names field.
 */
public void testFieldNames() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Set<String> fields = new HashSet<>();
    fields.add("fieldA");
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);
    TermsEnum termsEnum = terms.iterator();
    assertEquals(new BytesRef("fieldA"), termsEnum.next());
    assertNull(termsEnum.next());

    // seekExact
    termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("fieldA")));
    assertFalse(termsEnum.seekExact(new BytesRef("fieldB")));

    // seekCeil
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("fieldA")));
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("field0000")));
    assertEquals(new BytesRef("fieldA"), termsEnum.term());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldAAA")));
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldB")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test special handling for _field_names field (three fields, to exercise termsenum better)
 */
public void testFieldNamesThreeFields() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 3 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField("fieldC", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldC", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", "fieldC", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only two fields
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);
    TermsEnum termsEnum = terms.iterator();
    assertEquals(new BytesRef("fieldA"), termsEnum.next());
    assertEquals(new BytesRef("fieldC"), termsEnum.next());
    assertNull(termsEnum.next());

    // seekExact
    termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("fieldA")));
    assertFalse(termsEnum.seekExact(new BytesRef("fieldB")));
    assertTrue(termsEnum.seekExact(new BytesRef("fieldC")));

    // seekCeil
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("fieldA")));
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("fieldB")));
    assertEquals(new BytesRef("fieldC"), termsEnum.term());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldD")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test _field_names where a field is permitted, but doesn't exist in the segment.
 */
public void testFieldNamesMissing() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", "fieldC", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);

    // seekExact
    TermsEnum termsEnum = terms.iterator();
    assertFalse(termsEnum.seekExact(new BytesRef("fieldC")));

    // seekCeil
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldC")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test where _field_names does not exist
 */
public void testFieldNamesOldIndex() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", SourceFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    assertNull(segmentReader.terms(FieldNamesFieldMapper.NAME));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java
License:Open Source License
/**
 * test filtering an index with no fields
 */
public void testEmpty() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);
    iw.addDocument(new Document());

    // open reader
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // see no fields
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Set<String> seenFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        seenFields.add(info.name);
    }
    assertEquals(0, seenFields.size());
    assertNull(segmentReader.terms("foo"));

    // see no vectors
    assertNull(segmentReader.getTermVectors(0));

    // see no stored fields
    Document document = segmentReader.document(0);
    assertEquals(0, document.getFields().size());

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}
From source file:org.tallison.gramreaper.terms.DumpTerms.java
License:Apache License
private void dumpTopNField(LeafReader leafReader, String field) throws IOException {
    AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF)
            ? new TokenDFPriorityQueue(config.topN)
            : new TokenTFPriorityQueue(config.topN);
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            if (i++ > 0) {
                sb.append("\n");
            }
            sb.append(fieldInfo.name);
        }
        throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString());
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int docsWThisField = leafReader.getDocCount(field);
    while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (config.minDocFreq > -1 && df < config.minDocFreq) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (config.minDocPercentage > -1.0d && (double) df / (double) docsWThisField < config.minDocPercentage) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (queue.top() == null || queue.size() < config.topN
                || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) {
            String t = bytesRef.utf8ToString();
            if (!config.stopWords.contains(t) && !config.startWords.contains(t)) {
                queue.insertWithOverflow(new TokenDFTF(t, df, tf));
            }
        }
        bytesRef = termsEnum.next();
    }
    if (config.outputFile == null) {
        StringBuilder sb = new StringBuilder();
        for (TokenDFTF tp : queue.getArray()) {
            System.out.println(getRow(sb, tp));
        }
    } else if (Files.isDirectory(config.outputFile)) {
        writeTopN(config.outputFile.resolve(field), queue);
    } else {
        writeTopN(config.outputFile, queue);
    }
}
From source file:org.voyanttools.trombone.input.index.LuceneIndexer.java
License:Open Source License
public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {
    // let's check if we need to create new sources because of tokenization parameters
    if (parameters.getParameterValue("tokenization", "").isEmpty() == false
            || parameters.getParameterValue("language", "").isEmpty() == false) {
        StoredDocumentSourceStorage sourceDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
        String tokenizationParam = parameters.getParameterValue("tokenization", "");
        String langParam = parameters.getParameterValue("language", "");
        for (int i = 0, len = storedDocumentSources.size(); i < len; i++) {
            StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
            String id = storedDocumentSource.getId();
            String newId = DigestUtils.md5Hex(id + tokenizationParam + langParam);
            InputStream inputStream = sourceDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
            DocumentMetadata metadata = storedDocumentSource.getMetadata();
            // this is crucial to ensure that the document is re-analyzed and its metadata re-written
            metadata.setLastTokenPositionIndex(TokenType.lexical, 0);
            InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
            storedDocumentSources.set(i, sourceDocumentSourceStorage.getStoredDocumentSource(inputSource));
            inputStream.close();
        }
    }
    List<String> ids = new ArrayList<String>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
        ids.add(storedDocumentSource.getId());
    }
    String corpusId = storage.storeStrings(ids, Storage.Location.object);

    // determine if we need to modify the Lucene index
    Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
    if (storage.getLuceneManager().directoryExists(corpusId)) {
        LeafReader reader = SlowCompositeReaderWrapper
                .wrap(storage.getLuceneManager().getDirectoryReader(corpusId));
        Terms terms = reader.terms("id");
        if (terms == null) {
            storedDocumentSourceForLucene.addAll(storedDocumentSources);
        } else {
            TermsEnum termsEnum = terms.iterator();
            for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
                String id = storedDocumentSource.getId();
                if (!termsEnum.seekExact(new BytesRef(id))) {
                    storedDocumentSourceForLucene.add(storedDocumentSource);
                }
            }
        }
    } else {
        storedDocumentSourceForLucene.addAll(storedDocumentSources);
    }
    if (storedDocumentSourceForLucene.isEmpty() == false) {
        indexExecutorService(storedDocumentSourceForLucene, corpusId);
    }
    return corpusId;
}