Example usage for org.apache.lucene.index LeafReader terms

List of usage examples for org.apache.lucene.index LeafReader terms

Introduction

On this page you can find example usages of org.apache.lucene.index LeafReader.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Returns the Terms index for this field, or null if it has none.
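
The null return is the contract to watch. Below is a minimal sketch of the usual call pattern, assuming a LeafReaderContext named context and an illustrative field name "title" (neither comes from the examples below); most of the examples that follow hinge on the same null check.

// Minimal usage sketch: enumerate the terms of one field, guarding against a null Terms.
// The reader setup and the field name "title" are assumptions for illustration only.
LeafReader leafReader = context.reader();
Terms terms = leafReader.terms("title");
if (terms != null) { // null means the field has no terms index
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
}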

Usage

From source file: org.elasticsearch.index.shard.ShardSplittingQuery.java

License: Apache License

@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) {
    return new ConstantScoreWeight(this, boost) {
        @Override
        public String toString() {
            return "weight(delete docs query)";
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            LeafReader leafReader = context.reader();
            FixedBitSet bitSet = new FixedBitSet(leafReader.maxDoc());
            Terms terms = leafReader.terms(RoutingFieldMapper.NAME);
            Predicate<BytesRef> includeInShard = ref -> {
                int targetShardId = OperationRouting.generateShardId(indexMetaData,
                        Uid.decodeId(ref.bytes, ref.offset, ref.length), null);
                return shardId == targetShardId;
            };
            if (terms == null) {
                // this is the common case - no partitioning and no _routing values
                // in this case we also don't do anything special with regards to nested docs since we basically delete
                // by ID and parent and nested all have the same id.
                assert indexMetaData.isRoutingPartitionedIndex() == false;
                findSplitDocs(IdFieldMapper.NAME, includeInShard, leafReader, bitSet::set);
            } else {
                final BitSet parentBitSet;
                if (nestedParentBitSetProducer == null) {
                    parentBitSet = null;
                } else {
                    parentBitSet = nestedParentBitSetProducer.getBitSet(context);
                    if (parentBitSet == null) {
                        return null; // no matches
                    }
                }
                if (indexMetaData.isRoutingPartitionedIndex()) {
                    // this is the heaviest invariant. Here we have to visit all docs' stored fields to extract _id and _routing
                    // since this index is routing partitioned.
                    Visitor visitor = new Visitor(leafReader);
                    TwoPhaseIterator twoPhaseIterator = parentBitSet == null
                            ? new RoutingPartitionedDocIdSetIterator(visitor)
                            : new NestedRoutingPartitionedDocIdSetIterator(visitor, parentBitSet);
                    return new ConstantScoreScorer(this, score(), twoPhaseIterator);
                } else {
                    // here we potentially guard the docID consumers with our parent bitset if we have one.
                    // this ensures that we are only marking root documents in the nested case and if necessary
                    // we do a second pass to mark the corresponding children in markChildDocs
                    Function<IntConsumer, IntConsumer> maybeWrapConsumer = consumer -> {
                        if (parentBitSet != null) {
                            return docId -> {
                                if (parentBitSet.get(docId)) {
                                    consumer.accept(docId);
                                }
                            };
                        }
                        return consumer;
                    };
                    // in the _routing case we first go and find all docs that have a routing value and mark the ones we have to delete
                    findSplitDocs(RoutingFieldMapper.NAME, ref -> {
                        int targetShardId = OperationRouting.generateShardId(indexMetaData, null,
                                ref.utf8ToString());
                        return shardId == targetShardId;
                    }, leafReader, maybeWrapConsumer.apply(bitSet::set));

                    // now if we have a mixed index where some docs have a _routing value and some don't we have to exclude the ones
                    // with a routing value from the next iteration and delete / select based on the ID.
                    if (terms.getDocCount() != leafReader.maxDoc()) {
                        // this is a special case where some of the docs have no routing values; this sucks, but it's possible today
                        FixedBitSet hasRoutingValue = new FixedBitSet(leafReader.maxDoc());
                        findSplitDocs(RoutingFieldMapper.NAME, ref -> false, leafReader,
                                maybeWrapConsumer.apply(hasRoutingValue::set));
                        IntConsumer bitSetConsumer = maybeWrapConsumer.apply(bitSet::set);
                        findSplitDocs(IdFieldMapper.NAME, includeInShard, leafReader, docId -> {
                            if (hasRoutingValue.get(docId) == false) {
                                bitSetConsumer.accept(docId);
                            }
                        });
                    }
                }
                if (parentBitSet != null) {
                    // if nested docs are involved we also need to mark all child docs that belong to a matching parent doc.
                    markChildDocs(parentBitSet, bitSet);
                }
            }

            return new ConstantScoreScorer(this, score(), new BitSetIterator(bitSet, bitSet.length()));
        }

        @Override
        public boolean isCacheable(LeafReaderContext ctx) {
            // This is not a regular query, let's not cache it. It wouldn't help
            // anyway.
            return false;
        }
    };
}

From source file: org.elasticsearch.index.shard.ShardSplittingQuery.java

License: Apache License

private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader,
        IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test filtering two string fields
 */
public void testIndexed() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Set<String> seenFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        seenFields.add(info.name);
    }
    assertEquals(Collections.singleton("fieldA"), seenFields);
    assertNotNull(segmentReader.terms("fieldA"));
    assertNull(segmentReader.terms("fieldB"));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test special handling for _field_names field.
 */
public void testFieldNames() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Set<String> fields = new HashSet<>();
    fields.add("fieldA");
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);
    TermsEnum termsEnum = terms.iterator();
    assertEquals(new BytesRef("fieldA"), termsEnum.next());
    assertNull(termsEnum.next());

    // seekExact 
    termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("fieldA")));
    assertFalse(termsEnum.seekExact(new BytesRef("fieldB")));

    // seekCeil 
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("fieldA")));
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("field0000")));
    assertEquals(new BytesRef("fieldA"), termsEnum.term());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldAAA")));
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldB")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test special handling for _field_names field (three fields, to exercise termsenum better)
 */
public void testFieldNamesThreeFields() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 3 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField("fieldC", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldC", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", "fieldC", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only two fields
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);
    TermsEnum termsEnum = terms.iterator();
    assertEquals(new BytesRef("fieldA"), termsEnum.next());
    assertEquals(new BytesRef("fieldC"), termsEnum.next());
    assertNull(termsEnum.next());

    // seekExact 
    termsEnum = terms.iterator();
    assertTrue(termsEnum.seekExact(new BytesRef("fieldA")));
    assertFalse(termsEnum.seekExact(new BytesRef("fieldB")));
    assertTrue(termsEnum.seekExact(new BytesRef("fieldC")));

    // seekCeil 
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("fieldA")));
    assertEquals(SeekStatus.NOT_FOUND, termsEnum.seekCeil(new BytesRef("fieldB")));
    assertEquals(new BytesRef("fieldC"), termsEnum.term());
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldD")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test _field_names where a field is permitted, but doesn't exist in the segment.
 */
public void testFieldNamesMissing() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldA", Field.Store.NO));
    doc.add(new StringField(FieldNamesFieldMapper.NAME, "fieldB", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", "fieldC", FieldNamesFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Terms terms = segmentReader.terms(FieldNamesFieldMapper.NAME);

    // seekExact 
    TermsEnum termsEnum = terms.iterator();
    assertFalse(termsEnum.seekExact(new BytesRef("fieldC")));

    // seekCeil 
    termsEnum = terms.iterator();
    assertEquals(SeekStatus.END, termsEnum.seekCeil(new BytesRef("fieldC")));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test where _field_names does not exist
 */
public void testFieldNamesOldIndex() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);

    // add document with 2 fields
    Document doc = new Document();
    doc.add(new StringField("fieldA", "test", Field.Store.NO));
    doc.add(new StringField("fieldB", "test", Field.Store.NO));
    iw.addDocument(doc);

    // open reader
    Automaton automaton = Automatons.patterns(Arrays.asList("fieldA", SourceFieldMapper.NAME));
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw), new CharacterRunAutomaton(automaton));

    // see only one field
    LeafReader segmentReader = ir.leaves().get(0).reader();
    assertNull(segmentReader.terms(FieldNamesFieldMapper.NAME));

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.elasticsearch.xpack.core.security.authz.accesscontrol.FieldSubsetReaderTests.java

License: Open Source License

/**
 * test filtering an index with no fields
 */
public void testEmpty() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = new IndexWriterConfig(null);
    IndexWriter iw = new IndexWriter(dir, iwc);
    iw.addDocument(new Document());

    // open reader
    DirectoryReader ir = FieldSubsetReader.wrap(DirectoryReader.open(iw),
            new CharacterRunAutomaton(Automata.makeString("fieldA")));

    // see no fields
    LeafReader segmentReader = ir.leaves().get(0).reader();
    Set<String> seenFields = new HashSet<>();
    for (FieldInfo info : segmentReader.getFieldInfos()) {
        seenFields.add(info.name);
    }
    assertEquals(0, seenFields.size());
    assertNull(segmentReader.terms("foo"));

    // see no vectors
    assertNull(segmentReader.getTermVectors(0));

    // see no stored fields
    Document document = segmentReader.document(0);
    assertEquals(0, document.getFields().size());

    TestUtil.checkReader(ir);
    IOUtils.close(ir, iw, dir);
}

From source file: org.tallison.gramreaper.terms.DumpTerms.java

License: Apache License

private void dumpTopNField(LeafReader leafReader, String field) throws IOException {
    AbstractTokenTFDFPriorityQueue queue = config.sort.equals(DumpTermsConfig.SORT.DF)
            ? new TokenDFPriorityQueue(config.topN)
            : new TokenTFPriorityQueue(config.topN);
    Terms terms = leafReader.terms(field);
    if (terms == null) {
        StringBuilder sb = new StringBuilder();
        int i = 0;
        for (FieldInfo fieldInfo : leafReader.getFieldInfos()) {
            if (i++ > 0) {
                sb.append("\n");
            }
            sb.append(fieldInfo.name);
        }
        throw new RuntimeException("I can't find field \"" + field + "\".\n" + "I only see:\n" + sb.toString());
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int docsWThisField = leafReader.getDocCount(field);
    while (bytesRef != null) {
        int df = termsEnum.docFreq();
        long tf = termsEnum.totalTermFreq();
        if (config.minDocFreq > -1 && df < config.minDocFreq) {
            bytesRef = termsEnum.next();
            continue;
        }
        if (config.minDocPercentage > -1.0d
                && (double) df / (double) docsWThisField < config.minDocPercentage) {
            bytesRef = termsEnum.next();
            continue;
        }

        if (queue.top() == null || queue.size() < config.topN
                || (config.sort.equals(DumpTermsConfig.SORT.DF) ? df >= queue.top().df : tf > queue.top().tf)) {
            String t = bytesRef.utf8ToString();
            if (!config.stopWords.contains(t) && !config.startWords.contains(t)) {

                queue.insertWithOverflow(new TokenDFTF(t, df, tf));
            }
        }
        bytesRef = termsEnum.next();
    }
    if (config.outputFile == null) {
        StringBuilder sb = new StringBuilder();
        for (TokenDFTF tp : queue.getArray()) {
            System.out.println(getRow(sb, tp));
        }
    } else if (Files.isDirectory(config.outputFile)) {
        writeTopN(config.outputFile.resolve(field), queue);
    } else {
        writeTopN(config.outputFile, queue);
    }
}

From source file: org.voyanttools.trombone.input.index.LuceneIndexer.java

License: Open Source License

public String index(List<StoredDocumentSource> storedDocumentSources) throws IOException {

    // let's check if we need to create new sources because of tokenization parameters
    if (parameters.getParameterValue("tokenization", "").isEmpty() == false
            || parameters.getParameterValue("language", "").isEmpty() == false) {
        StoredDocumentSourceStorage sourceDocumentSourceStorage = storage.getStoredDocumentSourceStorage();
        String tokenizationParam = parameters.getParameterValue("tokenization", "");
        String langParam = parameters.getParameterValue("language", "");
        for (int i = 0, len = storedDocumentSources.size(); i < len; i++) {
            StoredDocumentSource storedDocumentSource = storedDocumentSources.get(i);
            String id = storedDocumentSource.getId();
            String newId = DigestUtils.md5Hex(id + tokenizationParam + langParam);
            InputStream inputStream = sourceDocumentSourceStorage.getStoredDocumentSourceInputStream(id);
            DocumentMetadata metadata = storedDocumentSource.getMetadata();
            metadata.setLastTokenPositionIndex(TokenType.lexical, 0); // this is crucial to ensure that the document is re-analyzed and its metadata re-written
            InputSource inputSource = new InputStreamInputSource(newId, metadata, inputStream);
            storedDocumentSources.set(i, sourceDocumentSourceStorage.getStoredDocumentSource(inputSource));
            inputStream.close();
        }
    }

    List<String> ids = new ArrayList<String>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
        ids.add(storedDocumentSource.getId());
    }
    String corpusId = storage.storeStrings(ids, Storage.Location.object);

    // determine if we need to modify the Lucene index
    Collection<StoredDocumentSource> storedDocumentSourceForLucene = new ArrayList<StoredDocumentSource>();
    if (storage.getLuceneManager().directoryExists(corpusId)) {
        LeafReader reader = SlowCompositeReaderWrapper
                .wrap(storage.getLuceneManager().getDirectoryReader(corpusId));
        Terms terms = reader.terms("id");
        if (terms == null) {
            storedDocumentSourceForLucene.addAll(storedDocumentSources);
        } else {
            TermsEnum termsEnum = terms.iterator();
            for (StoredDocumentSource storedDocumentSource : storedDocumentSources) {
                String id = storedDocumentSource.getId();
                if (!termsEnum.seekExact(new BytesRef(id))) {
                    storedDocumentSourceForLucene.add(storedDocumentSource);
                }
            }
        }
    } else {
        storedDocumentSourceForLucene.addAll(storedDocumentSources);
    }

    if (storedDocumentSourceForLucene.isEmpty() == false) {
        indexExecutorService(storedDocumentSourceForLucene, corpusId);
    }

    return corpusId;

}