Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexReader.maxDoc(), collected from open source projects.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number. This may be used, for example, to determine how big to allocate an array that will hold an element for every document number in the index. Note that maxDoc() includes deleted documents, so it can be larger than numDocs().
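
A minimal sketch of the typical pattern (assuming a Lucene 5.x+ setup; the index path and class name here are hypothetical): size a per-document array with maxDoc() and contrast it with numDocs().

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/path/to/index")); // hypothetical path
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Document numbers run from 0 to maxDoc() - 1, so maxDoc() is the
            // right length for an array with one slot per document number:
            float[] perDoc = new float[reader.maxDoc()];
            // maxDoc() counts deleted documents too; numDocs() does not:
            System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs()
                    + " perDoc.length=" + perDoc.length);
        } finally {
            reader.close();
            dir.close();
        }
    }
}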

Usage

From source file:org.zanata.hibernate.search.TextFlowIdFilter.java

License:Open Source License
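
Here maxDoc() sizes an OpenBitSet so that it has one bit per document number; a bit is then set for every document matching one of the given text flow IDs.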

@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    for (Long tfId : textFlowIds) {
        Term term = new Term("id", tfId.toString());
        TermDocs termDocs = reader.termDocs(term);
        while (termDocs.next())
            bitSet.set(termDocs.doc());
    }
    return bitSet;
}

From source file:perf.IndexAndSearchOpenStreetMaps.java

License:Apache License
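
This benchmark sums maxDoc() across the readers of several index partitions to report the total document count before timing geospatial queries.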

private static void queryIndex(String queryClass, int gons, int nearestTopN, String polyFile,
        boolean preBuildQueries, Double filterPercent, boolean doDistanceSort) throws IOException {
    IndexSearcher[] searchers = new IndexSearcher[NUM_PARTS];
    Directory[] dirs = new Directory[NUM_PARTS];
    long sizeOnDisk = 0;
    for (int part = 0; part < NUM_PARTS; part++) {
        dirs[part] = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        searchers[part] = new IndexSearcher(DirectoryReader.open(dirs[part]));
        searchers[part].setQueryCache(null);
        for (String name : dirs[part].listAll()) {
            sizeOnDisk += dirs[part].fileLength(name);
        }
    }
    //plotBKD(searchers[0].getIndexReader());
    System.out.println("INDEX SIZE: " + (sizeOnDisk / 1024. / 1024. / 1024.) + " GB");
    long bytes = 0;
    long maxDoc = 0;
    for (IndexSearcher s : searchers) {
        IndexReader r = s.getIndexReader();
        maxDoc += r.maxDoc();
        for (LeafReaderContext ctx : r.leaves()) {
            CodecReader cr = (CodecReader) ctx.reader();
            /*
            for(Accountable acc : cr.getChildResources()) {
              System.out.println("  " + Accountables.toString(acc));
            }
            */
            bytes += cr.ramBytesUsed();
        }
    }
    System.out.println("READER MB: " + (bytes / 1024. / 1024.));
    System.out.println("maxDoc=" + maxDoc);

    double bestQPS = Double.NEGATIVE_INFINITY;

    // million hits per second:
    double bestMHPS = Double.NEGATIVE_INFINITY;

    if (queryClass.equals("polyFile")) {

        // TODO: only load the double[][] here, so that we include the cost of making Polygon and Query in each iteration!!
        List<Polygon[]> polygons = readPolygons(polyFile);

        // Uncomment to find the lost points!!

        /*
        BooleanQuery.Builder b = new BooleanQuery.Builder();
        b.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
        for(Query q : queries) {
          b.add(q, BooleanClause.Occur.MUST_NOT);
        }
        searchers[0].search(b.build(), new SimpleCollector() {
            private int markerCount;
            private SortedNumericDocValues docValues;
                
            @Override
            protected void doSetNextReader(LeafReaderContext context) throws IOException {
              docValues = context.reader().getSortedNumericDocValues("point");
            }
                
            @Override
            public boolean needsScores() {
              return false;
            }
                
            @Override
            public void collect(int doc) {
              docValues.setDocument(doc);
              int count = docValues.count();
              for (int i = 0; i < count; i++) {
                long encoded = docValues.valueAt(i);
                double docLatitude = LatLonPoint.decodeLatitude((int)(encoded >> 32));
                double docLongitude = LatLonPoint.decodeLongitude((int)(encoded & 0xFFFFFFFF));
                System.out.println("        WE.marker([" + docLatitude + ", " + docLongitude + "]).addTo(earth);");
              }
            }
          });
        */

        /*
        {
          Query q = LatLonPoint.newBoxQuery("point", minLat, maxLat, minLon, maxLon);
          int totHits = 0;
                           
          for(IndexSearcher s : searchers) {
            int hitCount = s.count(q);
            totHits += hitCount;
          }
                
          System.out.println("Poly file bbox total hits: " + totHits);
        }
        */

        if (preBuildQueries) {
            System.out.println("\nUsing pre-built polygon queries, loaded from file " + polyFile);
            List<Query> queries = new ArrayList<>();
            for (Polygon[] multiPolygon : polygons) {
                Query q = null;
                if (useLatLonPoint) {
                    q = LatLonPoint.newPolygonQuery("point", multiPolygon);
                } else if (useGeoPoint) {
                    q = new GeoPointInPolygonQuery("point", multiPolygon);
                } else if (useGeo3DLarge) {
                    q = Geo3DPoint.newLargePolygonQuery("point", multiPolygon);
                } else if (useGeo3D) {
                    q = Geo3DPoint.newPolygonQuery("point", multiPolygon);
                }
                queries.add(q);
            }

            double[] result = runQueries(searchers, queries);
            bestQPS = result[0];
            bestMHPS = result[1];

        } else {

            System.out.println("\nUsing on-the-fly polygon queries, loaded from file " + polyFile);

            for (int iter = 0; iter < ITERS; iter++) {
                long tStart = System.nanoTime();
                long totHits = 0;
                int queryCount = 0;
                for (Polygon[] multiPolygon : polygons) {

                    // We do this to keep the benchmark honest, so any construction cost of a polygon is included in our run time measure:
                    multiPolygon = clonePolygon(multiPolygon);

                    Query q;
                    if (useLatLonPoint) {
                        q = LatLonPoint.newPolygonQuery("point", multiPolygon);
                    } else if (useGeoPoint) {
                        q = new GeoPointInPolygonQuery("point", multiPolygon);
                    } else {
                        q = Geo3DPoint.newLargePolygonQuery("point", multiPolygon);
                    }

                    for (IndexSearcher s : searchers) {
                        int hitCount = s.count(q);
                        totHits += hitCount;
                    }
                    queryCount++;
                }

                long tEnd = System.nanoTime();
                double elapsedSec = (tEnd - tStart) / 1000000000.0;
                double qps = queryCount / elapsedSec;
                double mhps = (totHits / 1000000.0) / elapsedSec;
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f M hits/sec, %.2f QPS (%.2f sec for %d queries), totHits=%d", iter, mhps,
                        qps, elapsedSec, queryCount, totHits));
                if (qps > bestQPS) {
                    System.out.println("  ***");
                    bestQPS = qps;
                    bestMHPS = mhps;
                }
            }
        }

    } else if (preBuildQueries) {
        System.out.println("\nUsing pre-built queries");

        double[] result = runQueries(searchers, makeQueries(queryClass, gons));
        bestQPS = result[0];
        bestMHPS = result[1];

    } else {
        System.out.println("\nUsing on-the-fly queries");

        // Create regularly spaced shapes in a grid around London, UK:
        int STEPS = 5;
        double MIN_LAT = 51.0919106;
        double MAX_LAT = 51.6542719;
        double MIN_LON = -0.3867282;
        double MAX_LON = 0.8492337;

        // makeRegularPoly has insanely slow math, so make the double[]'s here.
        // we still form the query inside the benchmark loop (e.g. to account for preprocessing)
        ArrayList<double[][]> polys = new ArrayList<double[][]>(225);
        if ("poly".equals(queryClass)) {
            for (int latStep = 0; latStep < STEPS; latStep++) {
                double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
                for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                    double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                    for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                        double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                        for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                            double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;
                            double distanceMeters = SloppyMath.haversinMeters(lat, lon, latEnd, lonEnd) / 2.0;
                            double centerLat = (lat + latEnd) / 2.0;
                            double centerLon = (lon + lonEnd) / 2.0;
                            polys.add(makeRegularPoly(centerLat, centerLon, distanceMeters, gons));
                        }
                    }
                }
            }
        }

        for (int iter = 0; iter < ITERS; iter++) {
            long tStart = System.nanoTime();
            long totHits = 0;
            double totNearestDistance = 0.0;
            int queryCount = 0;

            for (int latStep = 0; latStep < STEPS; latStep++) {
                double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
                for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                    double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                    for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                        double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                        for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                            double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;

                            double distanceMeters = SloppyMath.haversinMeters(lat, lon, latEnd, lonEnd) / 2.0;
                            double centerLat = (lat + latEnd) / 2.0;
                            double centerLon = (lon + lonEnd) / 2.0;
                            ScoreDoc[] nearestHits = null;
                            Query q = null;

                            switch (queryClass) {
                            case "distance":
                                if (useGeo3D || useGeo3DLarge) {
                                    q = Geo3DPoint.newDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else if (useGeoPoint) {
                                    q = new GeoPointDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "poly":
                                double[][] poly = polys.get(queryCount);
                                //System.out.println("poly lats: " + Arrays.toString(poly[0]));
                                //System.out.println("poly lons: " + Arrays.toString(poly[1]));
                                if (useGeo3DLarge) {
                                    //System.out.println("POLY:\n  lats=" + Arrays.toString(poly[0]) + "\n  lons=" + Arrays.toString(poly[1]));
                                    q = Geo3DPoint.newLargePolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useGeo3D) {
                                    q = Geo3DPoint.newPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useGeoPoint) {
                                    q = new GeoPointInPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "box":
                                if (useGeo3D || useGeo3DLarge) {
                                    q = Geo3DPoint.newBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else if (useGeoPoint) {
                                    q = new GeoPointInBBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "nearest":
                                if (useLatLonPoint) {
                                    if (searchers.length != 1) {
                                        // TODO
                                        throw new AssertionError();
                                    }
                                    nearestHits = LatLonPoint.nearest(searchers[0], "point",
                                            (lat + latEnd) / 2.0, (lon + lonEnd) / 2.0, nearestTopN).scoreDocs;
                                    if (false && iter == 0) {
                                        System.out.println("\n" + nearestHits.length + " nearest:");
                                        for (ScoreDoc hit : nearestHits) {
                                            System.out.println("  " + ((FieldDoc) hit).fields[0]);
                                        }
                                    }
                                    for (ScoreDoc hit : nearestHits) {
                                        totNearestDistance += (Double) ((FieldDoc) hit).fields[0];
                                    }
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            default:
                                throw new AssertionError("unknown queryClass " + queryClass);
                            }

                            // TODO: do this somewhere else?
                            if (filterPercent != null) {
                                BooleanQuery.Builder builder = new BooleanQuery.Builder();
                                builder.add(q, BooleanClause.Occur.MUST);
                                builder.add(new RandomQuery(filterPercent), BooleanClause.Occur.FILTER);
                                q = builder.build();
                            }

                            if (q != null) {
                                if (doDistanceSort) {
                                    Sort sort = new Sort(LatLonDocValuesField.newDistanceSort("point",
                                            centerLat, centerLon));
                                    for (IndexSearcher s : searchers) {
                                        TopFieldDocs hits = s.search(q, 10, sort);
                                        totHits += hits.totalHits;
                                    }
                                } else {
                                    //System.out.println("\nRUN QUERY " + q);
                                    //long t0 = System.nanoTime();
                                    for (IndexSearcher s : searchers) {
                                        int hitCount = s.count(q);
                                        totHits += hitCount;
                                        if (false && iter == 0) {
                                            System.out.println("q=" + q + " lat=" + centerLat + " lon="
                                                    + centerLon + " distanceMeters=" + distanceMeters
                                                    + " hits: " + hitCount);
                                        }
                                    }
                                }
                            } else {
                                assert nearestHits != null;
                                totHits += nearestHits.length;
                            }
                            queryCount++;
                            //throw new RuntimeException("now stop");
                        }
                    }
                }
            }

            long tEnd = System.nanoTime();
            double elapsedSec = (tEnd - tStart) / 1000000000.0;
            double qps = queryCount / elapsedSec;
            double mhps = (totHits / 1000000.0) / elapsedSec;
            if (queryClass.equals("nearest")) {
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f QPS (%.2f sec for %d queries), totNearestDistance=%.10f, totHits=%d",
                        iter, qps, elapsedSec, queryCount, totNearestDistance, totHits));
            } else {
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f M hits/sec, %.2f QPS (%.2f sec for %d queries), totHits=%d", iter, mhps,
                        qps, elapsedSec, queryCount, totHits));
            }
            if (qps > bestQPS) {
                System.out.println("  ***");
                bestQPS = qps;
                bestMHPS = mhps;
            }
        }
    }
    System.out.println("BEST M hits/sec: " + bestMHPS);
    System.out.println("BEST QPS: " + bestQPS);

    for (IndexSearcher s : searchers) {
        s.getIndexReader().close();
    }
    IOUtils.close(dirs);
}

From source file:perf.IndexAndSearchOpenStreetMapsGeo3D.java

License:Apache License
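
Here maxDoc() is printed as a quick sanity check on the index before running geo3d bounding-box queries over a grid around London.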

private static void queryIndex() throws IOException {
    Directory dir = FSDirectory.open(Paths.get("bkdtestgeo3d"));
    System.out.println("DIR: " + dir);
    IndexReader r = DirectoryReader.open(dir);
    System.out.println("maxDoc=" + r.maxDoc());
    IndexSearcher s = new IndexSearcher(r);
    //SegmentReader sr = (SegmentReader) r.leaves().get(0).reader();
    //BKDTreeReader reader = ((BKDTreeSortedNumericDocValues) sr.getSortedNumericDocValues("point")).getBKDTreeReader();

    //System.out.println("reader MB heap=" + (reader.ramBytesUsed()/1024/1024.));

    // London, UK:
    int STEPS = 5;
    double MIN_LAT = 51.0919106;
    double MAX_LAT = 51.6542719;
    double MIN_LON = -0.3867282;
    double MAX_LON = 0.8492337;
    for (int iter = 0; iter < 100; iter++) {
        long tStart = System.nanoTime();
        long totHits = 0;
        int queryCount = 0;
        for (int latStep = 0; latStep < STEPS; latStep++) {
            double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
            for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                    double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                    for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                        double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;

                        Query q = new PointInGeo3DShapeQuery(PlanetModel.WGS84, "point",
                                GeoBBoxFactory.makeGeoBBox(PlanetModel.WGS84, toRadians(latEnd), toRadians(lat),
                                        toRadians(lon), toRadians(lonEnd)));
                        TotalHitCountCollector c = new TotalHitCountCollector();
                        //long t0 = System.nanoTime();
                        s.search(q, c);

                        //System.out.println("\nITER: now query lat=" + lat + " latEnd=" + latEnd + " lon=" + lon + " lonEnd=" + lonEnd);
                        //Bits hits = reader.intersect(lat, latEnd, lon, lonEnd);
                        //System.out.println("  total hits: " + hitCount);
                        //totHits += ((FixedBitSet) hits).cardinality();
                        //System.out.println("  add tot " + c.getTotalHits());
                        totHits += c.getTotalHits();
                        queryCount++;
                    }
                }
            }
        }

        long tEnd = System.nanoTime();
        System.out.println("ITER: " + iter + " " + ((tEnd - tStart) / 1000000000.0) + " sec; totHits=" + totHits
                + "; " + queryCount + " queries");
    }

    IOUtils.close(r, dir);
}

From source file:perf.PKLookupPerfTest3X.java

License:Apache License
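
This primary-key lookup benchmark draws random IDs bounded by maxDoc() and accumulates per-segment maxDoc() values to map segment-local doc IDs to global ones.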

public static void main(String[] args) throws IOException {

    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);

    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }

    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }

    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);

    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }

    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);

    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println("  Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);

        final AtomicBoolean failed = new AtomicBoolean(false);

        final Term t = new Term("id", "");

        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);

        // cycle 0 is for warming
        //System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");

        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }

        if (DO_DOC_LOOKUP) {
            System.out.println("  Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }

    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");

    r.close();
    dir.close();
}

From source file:perf.RandomFilter.java

License:Apache License
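
This filter visits every document number below maxDoc() and keeps a random fraction of them.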

@Override
public DocIdSet getDocIdSet(IndexReader reader) {
    final Random rand = new Random(42);
    final int maxDoc = reader.maxDoc();
    OpenBitSet bits = new OpenBitSet(maxDoc);
    for (int docID = 0; docID < maxDoc; docID++) {
        if (rand.nextDouble() <= pctKeep) {
            bits.fastSet(docID);
        }
    }

    System.out.println("rfilt " + bits.cardinality());
    return bits;
}

From source file:pretraga.IsolationSimilarity.java
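
In this example maxDoc() provides the document count used to compute IDF values and collection statistics.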

public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);

            TermsEnum te = terms.iterator();
            Object tmp = te.next();
            while (tmp != null) {
                BytesRef by = (BytesRef) tmp;
                String term = by.utf8ToString();

                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                } else {
                    break; // the IDF/TF math below needs ClassicSimilarity; bail out rather than NPE
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
                tmp = te.next();
            }

            /*Iterator<String> ss = ff.iterator();
            while (ss.hasNext()) {
            String fieldString = ss.next();
            System.out.println(fieldString);
            }*/
        }
    } catch (Exception e) {
        e.printStackTrace(); // report rather than silently swallow failures
    }
}

From source file:proj.zoie.api.ZoieSegmentReader.java

License:Apache License
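
Zoie allocates a UID array of length maxDoc() and fills it from term-position payloads, marking gaps with DELETED_UID.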

private void init(IndexReader reader) throws IOException {
    int maxDoc = reader.maxDoc();
    _uidArray = new long[maxDoc];
    TermPositions tp = null;
    byte[] payloadBuffer = new byte[8]; // eight bytes for a long
    try {
        tp = reader.termPositions(UID_TERM);
        int idx = 0;
        while (tp.next()) {
            int doc = tp.doc();
            assert doc < maxDoc;

            while (idx < doc)
                _uidArray[idx++] = DELETED_UID; // fill the gap

            tp.nextPosition();
            tp.getPayload(payloadBuffer, 0);
            long uid = bytesToLong(payloadBuffer);
            if (uid < _minUID)
                _minUID = uid;
            if (uid > _maxUID)
                _maxUID = uid;
            _uidArray[idx++] = uid;
        }
        while (idx < maxDoc)
            _uidArray[idx++] = DELETED_UID; // fill the gap
    } finally {
        if (tp != null) {
            tp.close();
        }
    }
}

From source file:psidev.psi.mi.search.engine.impl.AbstractSearchEngine.java

License:Apache License
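
Here maxDoc() acts as the total result count when paging through every document in the index.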

public SearchResult<T> searchAll(Integer firstResult, Integer maxResults) throws SearchEngineException {
    if (firstResult == null)
        firstResult = 0;
    if (maxResults == null)
        maxResults = MAX_TOP_RESULTS;

    IndexReader reader = indexSearcher.getIndexReader();

    int totalCount = reader.maxDoc();

    if (maxResults == 0) {
        return new SearchResult(Collections.EMPTY_LIST, totalCount, firstResult, maxResults,
                new WildcardQuery(new Term("", "*")));
    }

    // this is a hack to ignore any header introduced in the index by mistake (first development versions)
    if (reader.isDeleted(0)) {
        firstResult++;
        totalCount--;
    }

    if (firstResult > totalCount) {
        //            closeIndexReader(reader);
        return new SearchResult(Collections.EMPTY_LIST, totalCount, firstResult, maxResults,
                new WildcardQuery(new Term("", "*")));
    }

    int maxIndex = Math.min(totalCount, firstResult + maxResults);

    List<T> dataObjects = new ArrayList<T>();

    for (int i = firstResult; i < maxIndex; i++) {
        try {
            Document doc = reader.document(i);
            T data = (T) createDocumentBuilder().createData(doc);
            dataObjects.add(data);
        } catch (Exception e) {
            //                closeIndexReader(reader);
            throw new SearchEngineException(e);
        }
    }

    //        closeIndexReader(reader);
    return new SearchResult(dataObjects, totalCount, firstResult, maxResults,
            new WildcardQuery(new Term("", "*")));
}

From source file:solr2155.lucene.spatial.geohash.GeoHashPrefixFilter.java

License:Apache License
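
This spatial filter allocates an OpenBitSet of maxDoc() bits before walking geohash prefix terms.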

@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    final TermsEnumCompatibility termsEnum = new TermsEnumCompatibility(reader, fieldName);//Lucene 4 compatibility wrapper
    final TermDocs termDocs = reader.termDocs();
    Term term = termsEnum.term();//the most recent term examined via termsEnum.term()
    if (term == null)
        return bits;

    //TODO Add a precision short-circuit so that we are not accurate on the edge but we're faster.

    //TODO An array based nodes impl would be more efficient; or a stack of iterators.  LinkedList conveniently has bulk add to beginning.
    LinkedList<GridNode> nodes = new LinkedList<GridNode>(
            gridReferenceSystem.getSubNodes(geoShape.boundingRectangle()));
    while (!nodes.isEmpty() && term != null) {
        final GridNode node = nodes.removeFirst();
        assert node.length() > 0;
        if (!node.contains(term.text()) && node.before(term.text()))
            continue;//short circuit, moving >= the next indexed term
        IntersectCase intersection = geoShape.intersect(node.getRectangle());
        if (intersection == IntersectCase.OUTSIDE)
            continue;
        TermsEnumCompatibility.SeekStatus seekStat = termsEnum.seek(node.getTermVal());
        term = termsEnum.term();
        if (seekStat != TermsEnumCompatibility.SeekStatus.FOUND)
            continue;
        if (intersection == IntersectCase.CONTAINS) {
            termDocs.seek(term);
            addDocs(termDocs, bits);
            term = termsEnum.next();//move to next term
        } else {//any other intersection
            //TODO is it worth it to optimize the shape (e.g. potentially simpler polygon)?
            //GeoShape geoShape = this.geoShape.optimize(intersection);

            //We either scan through the leaf node(s), or if there are many points then we divide & conquer.
            boolean manyPoints = node.length() < gridReferenceSystem.maxLen - GRIDLEN_SCAN_THRESHOLD;

            //TODO Try variable depth strategy:
            //IF configured to do so, we could use term.freq() as an estimate on the number of places at this depth.  OR, perhaps
            //  make estimates based on the total known term count at this level?  Or don't worry about it--use fixed depth.
            //        if (manyPoints) {
            //          //Make some estimations on how many points there are at this level and how few there would need to be to set
            //          // manyPoints to false.
            //
            //          long termsThreshold = (long) estimateNumberIndexedTerms(node.length(),geoShape.getDocFreqExpenseThreshold(node));
            //
            //          long thisOrd = termsEnum.ord();
            //          manyPoints = (termsEnum.seek(thisOrd+termsThreshold+1) != TermsEnum.SeekStatus.END
            //                  && node.contains(termsEnum.term()));
            //          termsEnum.seek(thisOrd);//return to last position
            //        }

            if (!manyPoints) {
                //traverse all leaf terms within this node to see if they are within the geoShape, one by one.
                for (; term != null && node.contains(term.text()); term = termsEnum.next()) {
                    if (term.text().length() < gridReferenceSystem.maxLen)//not a leaf
                        continue;
                    final Point2D point = gridReferenceSystem.decodeXY(term.text());
                    //Filter those out of the shape.
                    if (!geoShape.contains(point))
                        continue;

                    //record
                    termDocs.seek(term);
                    addDocs(termDocs, bits);
                }
            } else {
                //divide & conquer
                nodes.addAll(0, node.getSubNodes());//add to beginning
            }
        }
    } //node loop

    return bits;
}

From source file:solr2155.solr.search.function.GeoHashValueSource.java

License:Apache License
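
This value source lazily allocates a per-document points cache with one slot per document number, using reader.maxDoc() as its length.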

@SuppressWarnings({ "unchecked" })
GeoHashValueSource(String fieldName, SolrIndexSearcher searcher) throws IOException {
    log.info("Loading geohash field " + fieldName + " into memory.");
    this.fieldName = fieldName;

    //Get gridReferenceSystem
    final GridNode.GridReferenceSystem gridReferenceSystem;
    FieldType fieldType = searcher.getSchema().getField(fieldName).getType();
    if (fieldType instanceof GeoHashField) {
        gridReferenceSystem = ((GeoHashField) fieldType).getGridReferenceSystem();
    } else
        throw new RuntimeException(
                "field " + fieldName + " should be a GeoHashField, not " + fieldType.getTypeName());

    //Traverse the index to load up doc2PointsCache
    IndexReader reader = searcher.getIndexReader();
    TermsEnumCompatibility termsEnum = new TermsEnumCompatibility(reader, fieldName);
    TermDocs termDocs = reader.termDocs(); //cached for termsEnum.docs() calls
    try {
        while (true) {
            final Term term = termsEnum.next();
            if (term == null)
                break;
            if (term.text().length() != gridReferenceSystem.getPrecision())
                continue;
            Point2D point = gridReferenceSystem.decodeXY(term.text());
            termDocs.seek(termsEnum.getTermEnum());
            while (termDocs.next()) {
                final int docId = termDocs.doc();
                if (docId == DocIdSetIterator.NO_MORE_DOCS)
                    break;
                if (doc2PointsCache == null)
                    doc2PointsCache = (List<Point2D>[]) new List[reader.maxDoc()];//java generics hack
                List<Point2D> points = doc2PointsCache[docId];
                if (points == null) {
                    points = new ArrayList<Point2D>(DEFAULT_ARRAY_CAPACITY);
                    doc2PointsCache[docId] = points;
                }
                points.add(point);
            }
        }
    } finally { // in Lucene 3 these should be closed (not in Lucene 4)
        termDocs.close();
        termsEnum.close();
    }

    //Log statistics
    if (log.isInfoEnabled()) {
        int min = Integer.MAX_VALUE, sum = 0, max = 0;
        int dlen = 0;
        if (doc2PointsCache != null) {
            dlen = doc2PointsCache.length;
            for (List<Point2D> point2Ds : doc2PointsCache) {
                int plen = (point2Ds == null ? 0 : point2Ds.size());
                min = Math.min(min, plen);
                max = Math.max(max, plen);
                sum += plen;
            }
        }
        if (min == Integer.MAX_VALUE)
            min = 0;
        float avg = (float) sum / dlen;
        log.info("field '" + fieldName + "' in RAM: loaded min/avg/max per doc #: (" + min + "," + avg + ","
                + max + ") #" + dlen);
    }
}