Example usage for org.apache.lucene.index IndexReader leaves

Introduction

This page collects example usages of the org.apache.lucene.index IndexReader leaves() method.

Prototype

public final List<LeafReaderContext> leaves() 

Document

Returns the reader's leaves, or itself if this reader is atomic.
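
A minimal sketch of how leaves() is typically used, assuming an index already exists on disk (the index path below is an illustrative placeholder, not taken from the examples that follow):

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class LeavesExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; substitute the location of a real index.
        try (Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
                IndexReader reader = DirectoryReader.open(dir)) {
            for (LeafReaderContext ctx : reader.leaves()) {
                // Each leaf wraps an atomic segment reader; docBase is the leaf's
                // starting document id within the composite reader.
                System.out.println("leaf " + ctx.ord + ": maxDoc=" + ctx.reader().maxDoc()
                        + ", docBase=" + ctx.docBase);
            }
        }
    }
}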

Usage

From source file:IndexAndSearchOpenStreetMaps1D.java

License:Apache License

private static void queryIndex() throws IOException {
    Directory dir = FSDirectory.open(Paths.get("/l/tmp/1dkd" + (USE_NF ? "_nf" : "")));
    System.out.println("DIR: " + dir);
    IndexReader r = DirectoryReader.open(dir);
    System.out.println("maxDoc=" + r.maxDoc());

    IndexSearcher s = new IndexSearcher(r);

    //System.out.println("reader MB heap=" + (reader.ramBytesUsed()/1024/1024.));

    // London, UK:
    int STEPS = 5;
    double MIN_LAT = 51.0919106;
    double MAX_LAT = 51.6542719;
    double MIN_LON = -0.3867282;
    double MAX_LON = 0.8492337;
    byte[] scratch1 = new byte[4];
    byte[] scratch2 = new byte[4];
    for (int iter = 0; iter < 100; iter++) {
        long tStart = System.nanoTime();
        long totHits = 0;
        int queryCount = 0;
        for (int latStep = 0; latStep < STEPS; latStep++) {
            double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
            for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                    double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                    for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                        double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;

                        Query q;
                        if (USE_NF) {
                            q = LegacyNumericRangeQuery.newIntRange("latnum", (int) (1000000. * lat),
                                    (int) (1000000. * latEnd), true, true);
                        } else {
                            q = IntPoint.newRangeQuery("lat", (int) (1000000. * lat),
                                    (int) (1000000. * latEnd));
                        }

                        TotalHitCountCollector c = new TotalHitCountCollector();
                        //long t0 = System.nanoTime();
                        s.search(q, c);

                        //System.out.println("\nITER: now query lat=" + lat + " latEnd=" + latEnd + " lon=" + lon + " lonEnd=" + lonEnd);
                        //Bits hits = reader.intersect(lat, latEnd, lon, lonEnd);
                        //System.out.println("  total hits: " + hitCount);
                        //totHits += ((FixedBitSet) hits).cardinality();
                        //System.out.println("  add tot " + c.getTotalHits());
                        totHits += c.getTotalHits();
                        queryCount++;
                    }
                }
            }
        }

        long tEnd = System.nanoTime();
        System.out.println("ITER: " + iter + " " + ((tEnd - tStart) / 1000000000.0) + " sec; totHits=" + totHits
                + "; " + queryCount + " queries");

        if (iter == 0) {
            long bytes = 0;
            for (LeafReaderContext ctx : r.leaves()) {
                CodecReader cr = (CodecReader) ctx.reader();
                System.out.println(Accountables.toString(cr));
                bytes += cr.ramBytesUsed();
            }
            System.out.println("READER MB: " + (bytes / 1024. / 1024.));
            System.out.println("RAM: " + Accountables.toString((Accountable) r.leaves().get(0).reader()));
        }
    }

    IOUtils.close(r, dir);
}

From source file:com.basistech.lucene.tools.LuceneQueryTool.java

License:Apache License

LuceneQueryTool(IndexReader reader, PrintStream out) throws IOException {
    this.indexReader = reader;
    this.outputLimit = Integer.MAX_VALUE;
    this.analyzer = new KeywordAnalyzer();
    this.fieldNames = Lists.newArrayList();
    this.defaultOut = out;
    allFieldNames = Sets.newTreeSet();
    for (LeafReaderContext leaf : reader.leaves()) {
        for (FieldInfo fieldInfo : leaf.reader().getFieldInfos()) {
            allFieldNames.add(fieldInfo.name);
        }
    }
    this.formatter = Formatter.newInstance(Formatter.Format.MULTILINE, false);
}

From source file:com.github.tteofili.looseen.TestWikipediaClassification.java

License:Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapse = (int) (endTime - startTime) / 1000;

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapse + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file:com.lucid.solr.sidecar.SidecarIndexReader.java

License:Apache License

static AtomicReader[] getSequentialSubReaders(IndexReader r) {
    List<AtomicReaderContext> leaves = r.leaves();
    AtomicReader subReaders[] = new AtomicReader[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        subReaders[i] = leaves.get(i).reader();
    }
    return subReaders;
}

From source file:com.meltwater.elasticsearch.index.BatchPercolatorService.java

License:Apache License

private boolean hasDocumentMatchingFilter(IndexReader reader, Optional<Filter> optionalFilter)
        throws IOException {
    if (optionalFilter.isPresent()) {
        Filter filter = optionalFilter.get();
        boolean found = false;
        // If you are not familiar with Lucene, this basically means that we try to
        // create an iterator over the doc ids that match the filter for the given reader.
        // Both the filter and the DocIdSet may return null as an optimisation,
        // hence the null checks. Null means there were no matching docs, and the
        // same is true if the iterator returns NO_MORE_DOCS immediately.
        for (AtomicReaderContext leaf : reader.leaves()) {
            DocIdSet idSet = filter.getDocIdSet(leaf, leaf.reader().getLiveDocs());
            if (idSet != null) {
                DocIdSetIterator iter = idSet.iterator();
                if (iter != null && iter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    found = true;
                    break;
                }

            }
        }
        return found;
    } else {
        return true;
    }
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testReadingPostings() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

@Test
public void testCloneIndex() throws Exception {
    System.out.println("Cloning index:");
    Directory dir1 = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir1);

    Directory dir2 = FSDirectory.open(tempDir2);
    IndexWriterConfig config = new IndexWriterConfig(new EnglishAnalyzer());
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter writer = new IndexWriter(dir2, config);

    LeafReader leafReader = reader.leaves().get(0).reader();
    CodecReader codecReader = SlowCodecReaderWrapper.wrap(leafReader);
    writer.addIndexes(new MyFilterCodecReader(codecReader));
    writer.commit();
    writer.forceMerge(1);
    writer.close();

    reader.close();

    // Open up the cloned index and verify it.
    reader = DirectoryReader.open(dir2);
    assertEquals(3, reader.numDocs());
    assertEquals(1, reader.leaves().size());

    System.out.println("Dumping out postings...");
    dumpPostings(reader);

    assertEquals(2, reader.docFreq(new Term("text", "here")));
    assertEquals(2, reader.docFreq(new Term("text", "more")));
    assertEquals(1, reader.docFreq(new Term("text", "some")));
    assertEquals(1, reader.docFreq(new Term("text", "test")));
    assertEquals(2, reader.docFreq(new Term("text", "text")));

    reader.close();
}

From source file:io.yucca.lucene.FieldRemover.java

License:Apache License

/**
 * Remove fields from an index. All readers and the writer are closed on
 * completion or on an exception.
 *
 * @param reader
 *            IndexReader to read the source index from
 * @param writer
 *            IndexWriter for the destination index
 * @param fields
 *            String[] fields to remove
 */
public void removeFields(IndexReader reader, IndexWriter writer, String[] fields) {
    Set<String> removals = toTrimmedSet(fields);
    List<AtomicReaderContext> leaves = reader.leaves();
    AtomicReader wrappedLeaves[] = new AtomicReader[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        wrappedLeaves[i] = new FieldFilterAtomicReader(leaves.get(i).reader(), removals, true);
    }
    try {
        MultiReader mr = new MultiReader(wrappedLeaves, true);
        writer.addIndexes(mr);
        writer.commit();
        writer.close();
        mr.close();
    } catch (IOException e) {
        log.error("Writing new index failed.", e);
    } finally {
        IOUtils.closeWhileHandlingException(reader);
        IOUtils.closeWhileHandlingException(writer);
        IOUtils.closeWhileHandlingException(writer.getDirectory());
    }
}

From source file:nl.inl.blacklab.search.HitsImpl.java

License:Apache License

/**
 * Construct a Hits object from a SpanQuery.
 *
 * @param searcher
 *            the searcher object
 * @param sourceQuery
 *            the query to execute to get the hits
 * @throws TooManyClauses if the query is overly broad (expands to too many terms)
 */
HitsImpl(Searcher searcher, SpanQuery sourceQuery) throws TooManyClauses {
    this(searcher, (List<Hit>) null);
    try {
        IndexReader reader = searcher.getIndexReader();
        spanQuery = (SpanQuery) sourceQuery.rewrite(reader);
        termContexts = new HashMap<>();
        Set<Term> terms = new HashSet<>();
        extractTermsFromSpanQuery(terms);
        etiquette = new ThreadPriority();
        for (Term term : terms) {
            try {
                etiquette.behave();
            } catch (InterruptedException e) {
                // Taking too long, break it off.
                // Not a very graceful way to do it... but at least it won't
                // be stuck forever.
                Thread.currentThread().interrupt(); // client can check this
                throw new RuntimeException("Query matches too many terms; aborted.");
            }
            termContexts.put(term, TermContext.build(reader.getContext(), term));
        }

        currentSourceSpans = null;
        atomicReaderContexts = reader == null ? null : reader.leaves();
        atomicReaderContextIndex = -1;
        //sourceSpans = BLSpansWrapper.optWrap(spanQuery.getSpans(srw != null ? srw.getContext() : null, srw != null ? srw.getLiveDocs() : null, termContexts));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    sourceSpansFullyRead = false;
}