Example usage for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usages of org.apache.lucene.index.IndexReader.maxDoc(), collected from open source projects.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number. This may be used, for example, to determine how big to allocate an array that will hold an element for every document number in the index. Note that maxDoc() includes deleted documents, so it can be larger than numDocs().
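
A minimal sketch of the typical pattern (assuming a Lucene 5.x+ setup; the index path and class name here are hypothetical): size a per-document array with maxDoc() and contrast it with numDocs().

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/path/to/index")); // hypothetical path
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // Document numbers run from 0 to maxDoc() - 1, so maxDoc() is the
            // right length for an array with one slot per document number:
            float[] perDoc = new float[reader.maxDoc()];
            // maxDoc() counts deleted documents too; numDocs() does not:
            System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs()
                    + " perDoc.length=" + perDoc.length);
        } finally {
            reader.close();
            dir.close();
        }
    }
}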

Usage

From source file:org.zanata.hibernate.search.TextFlowIdFilter.java

License:Open Source License
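
Here maxDoc() sizes an OpenBitSet so that it has one bit per document number; a bit is then set for every document matching one of the given text flow IDs.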

@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    OpenBitSet bitSet = new OpenBitSet(reader.maxDoc());
    for (Long tfId : textFlowIds) {
        Term term = new Term("id", tfId.toString());
        TermDocs termDocs = reader.termDocs(term);
        while (termDocs.next())
            bitSet.set(termDocs.doc());
    }
    return bitSet;
}

From source file:perf.IndexAndSearchOpenStreetMaps.java

License:Apache License
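
This benchmark sums maxDoc() across the readers of several index partitions to report the total document count before timing geospatial queries.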

private static void queryIndex(String queryClass, int gons, int nearestTopN, String polyFile,
        boolean preBuildQueries, Double filterPercent, boolean doDistanceSort) throws IOException {
    IndexSearcher[] searchers = new IndexSearcher[NUM_PARTS];
    Directory[] dirs = new Directory[NUM_PARTS];
    long sizeOnDisk = 0;
    for (int part = 0; part < NUM_PARTS; part++) {
        dirs[part] = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        searchers[part] = new IndexSearcher(DirectoryReader.open(dirs[part]));
        searchers[part].setQueryCache(null);
        for (String name : dirs[part].listAll()) {
            sizeOnDisk += dirs[part].fileLength(name);
        }
    }
    //plotBKD(searchers[0].getIndexReader());
    System.out.println("INDEX SIZE: " + (sizeOnDisk / 1024. / 1024. / 1024.) + " GB");
    long bytes = 0;
    long maxDoc = 0;
    for (IndexSearcher s : searchers) {
        IndexReader r = s.getIndexReader();
        maxDoc += r.maxDoc();
        for (LeafReaderContext ctx : r.leaves()) {
            CodecReader cr = (CodecReader) ctx.reader();
            /*
            for(Accountable acc : cr.getChildResources()) {
              System.out.println("  " + Accountables.toString(acc));
            }
            */
            bytes += cr.ramBytesUsed();
        }
    }
    System.out.println("READER MB: " + (bytes / 1024. / 1024.));
    System.out.println("maxDoc=" + maxDoc);

    double bestQPS = Double.NEGATIVE_INFINITY;

    // million hits per second:
    double bestMHPS = Double.NEGATIVE_INFINITY;

    if (queryClass.equals("polyFile")) {

        // TODO: only load the double[][] here, so that we include the cost of making Polygon and Query in each iteration!!
        List<Polygon[]> polygons = readPolygons(polyFile);

        // Uncomment to find the lost points!!

        /*
        BooleanQuery.Builder b = new BooleanQuery.Builder();
        b.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
        for(Query q : queries) {
          b.add(q, BooleanClause.Occur.MUST_NOT);
        }
        searchers[0].search(b.build(), new SimpleCollector() {
            private int markerCount;
            private SortedNumericDocValues docValues;
                
            @Override
            protected void doSetNextReader(LeafReaderContext context) throws IOException {
              docValues = context.reader().getSortedNumericDocValues("point");
            }
                
            @Override
            public boolean needsScores() {
              return false;
            }
                
            @Override
            public void collect(int doc) {
              docValues.setDocument(doc);
              int count = docValues.count();
              for (int i = 0; i < count; i++) {
                long encoded = docValues.valueAt(i);
                double docLatitude = LatLonPoint.decodeLatitude((int)(encoded >> 32));
                double docLongitude = LatLonPoint.decodeLongitude((int)(encoded & 0xFFFFFFFF));
                System.out.println("        WE.marker([" + docLatitude + ", " + docLongitude + "]).addTo(earth);");
              }
            }
          });
        */

        /*
        {
          Query q = LatLonPoint.newBoxQuery("point", minLat, maxLat, minLon, maxLon);
          int totHits = 0;
                           
          for(IndexSearcher s : searchers) {
            int hitCount = s.count(q);
            totHits += hitCount;
          }
                
          System.out.println("Poly file bbox total hits: " + totHits);
        }
        */

        if (preBuildQueries) {
            System.out.println("\nUsing pre-built polygon queries, loaded from file " + polyFile);
            List<Query> queries = new ArrayList<>();
            for (Polygon[] multiPolygon : polygons) {
                Query q = null;
                if (useLatLonPoint) {
                    q = LatLonPoint.newPolygonQuery("point", multiPolygon);
                } else if (useGeoPoint) {
                    q = new GeoPointInPolygonQuery("point", multiPolygon);
                } else if (useGeo3DLarge) {
                    q = Geo3DPoint.newLargePolygonQuery("point", multiPolygon);
                } else if (useGeo3D) {
                    q = Geo3DPoint.newPolygonQuery("point", multiPolygon);
                }
                queries.add(q);
            }

            double[] result = runQueries(searchers, queries);
            bestQPS = result[0];
            bestMHPS = result[1];

        } else {

            System.out.println("\nUsing on-the-fly polygon queries, loaded from file " + polyFile);

            for (int iter = 0; iter < ITERS; iter++) {
                long tStart = System.nanoTime();
                long totHits = 0;
                int queryCount = 0;
                for (Polygon[] multiPolygon : polygons) {

                    // We do this to keep the benchmark honest, so any construction cost of a polygon is included in our run time measure:
                    multiPolygon = clonePolygon(multiPolygon);

                    Query q;
                    if (useLatLonPoint) {
                        q = LatLonPoint.newPolygonQuery("point", multiPolygon);
                    } else if (useGeoPoint) {
                        q = new GeoPointInPolygonQuery("point", multiPolygon);
                    } else {
                        q = Geo3DPoint.newLargePolygonQuery("point", multiPolygon);
                    }

                    for (IndexSearcher s : searchers) {
                        int hitCount = s.count(q);
                        totHits += hitCount;
                    }
                    queryCount++;
                }

                long tEnd = System.nanoTime();
                double elapsedSec = (tEnd - tStart) / 1000000000.0;
                double qps = queryCount / elapsedSec;
                double mhps = (totHits / 1000000.0) / elapsedSec;
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f M hits/sec, %.2f QPS (%.2f sec for %d queries), totHits=%d", iter, mhps,
                        qps, elapsedSec, queryCount, totHits));
                if (qps > bestQPS) {
                    System.out.println("  ***");
                    bestQPS = qps;
                    bestMHPS = mhps;
                }
            }
        }

    } else if (preBuildQueries) {
        System.out.println("\nUsing pre-built queries");

        double[] result = runQueries(searchers, makeQueries(queryClass, gons));
        bestQPS = result[0];
        bestMHPS = result[1];

    } else {
        System.out.println("\nUsing on-the-fly queries");

        // Create regularly spaced shapes in a grid around London, UK:
        int STEPS = 5;
        double MIN_LAT = 51.0919106;
        double MAX_LAT = 51.6542719;
        double MIN_LON = -0.3867282;
        double MAX_LON = 0.8492337;

        // makeRegularPoly has insanely slow math, so make the double[]'s here.
        // we still form the query inside the benchmark loop (e.g. to account for preprocessing)
        ArrayList<double[][]> polys = new ArrayList<double[][]>(225);
        if ("poly".equals(queryClass)) {
            for (int latStep = 0; latStep < STEPS; latStep++) {
                double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
                for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                    double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                    for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                        double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                        for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                            double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;
                            double distanceMeters = SloppyMath.haversinMeters(lat, lon, latEnd, lonEnd) / 2.0;
                            double centerLat = (lat + latEnd) / 2.0;
                            double centerLon = (lon + lonEnd) / 2.0;
                            polys.add(makeRegularPoly(centerLat, centerLon, distanceMeters, gons));
                        }
                    }
                }
            }
        }

        for (int iter = 0; iter < ITERS; iter++) {
            long tStart = System.nanoTime();
            long totHits = 0;
            double totNearestDistance = 0.0;
            int queryCount = 0;

            for (int latStep = 0; latStep < STEPS; latStep++) {
                double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
                for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                    double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                    for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                        double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                        for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                            double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;

                            double distanceMeters = SloppyMath.haversinMeters(lat, lon, latEnd, lonEnd) / 2.0;
                            double centerLat = (lat + latEnd) / 2.0;
                            double centerLon = (lon + lonEnd) / 2.0;
                            ScoreDoc[] nearestHits = null;
                            Query q = null;

                            switch (queryClass) {
                            case "distance":
                                if (useGeo3D || useGeo3DLarge) {
                                    q = Geo3DPoint.newDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else if (useGeoPoint) {
                                    q = new GeoPointDistanceQuery("point", centerLat, centerLon,
                                            distanceMeters);
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "poly":
                                double[][] poly = polys.get(queryCount);
                                //System.out.println("poly lats: " + Arrays.toString(poly[0]));
                                //System.out.println("poly lons: " + Arrays.toString(poly[1]));
                                if (useGeo3DLarge) {
                                    //System.out.println("POLY:\n  lats=" + Arrays.toString(poly[0]) + "\n  lons=" + Arrays.toString(poly[1]));
                                    q = Geo3DPoint.newLargePolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useGeo3D) {
                                    q = Geo3DPoint.newPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else if (useGeoPoint) {
                                    q = new GeoPointInPolygonQuery("point", new Polygon(poly[0], poly[1]));
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "box":
                                if (useGeo3D || useGeo3DLarge) {
                                    q = Geo3DPoint.newBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else if (useLatLonPoint) {
                                    q = LatLonPoint.newBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else if (useGeoPoint) {
                                    q = new GeoPointInBBoxQuery("point", lat, latEnd, lon, lonEnd);
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            case "nearest":
                                if (useLatLonPoint) {
                                    if (searchers.length != 1) {
                                        // TODO
                                        throw new AssertionError();
                                    }
                                    nearestHits = LatLonPoint.nearest(searchers[0], "point",
                                            (lat + latEnd) / 2.0, (lon + lonEnd) / 2.0, nearestTopN).scoreDocs;
                                    if (false && iter == 0) {
                                        System.out.println("\n" + nearestHits.length + " nearest:");
                                        for (ScoreDoc hit : nearestHits) {
                                            System.out.println("  " + ((FieldDoc) hit).fields[0]);
                                        }
                                    }
                                    for (ScoreDoc hit : nearestHits) {
                                        totNearestDistance += (Double) ((FieldDoc) hit).fields[0];
                                    }
                                } else {
                                    throw new AssertionError();
                                }
                                break;
                            default:
                                throw new AssertionError("unknown queryClass " + queryClass);
                            }

                            // TODO: do this somewhere else?
                            if (filterPercent != null) {
                                BooleanQuery.Builder builder = new BooleanQuery.Builder();
                                builder.add(q, BooleanClause.Occur.MUST);
                                builder.add(new RandomQuery(filterPercent), BooleanClause.Occur.FILTER);
                                q = builder.build();
                            }

                            if (q != null) {
                                if (doDistanceSort) {
                                    Sort sort = new Sort(LatLonDocValuesField.newDistanceSort("point",
                                            centerLat, centerLon));
                                    for (IndexSearcher s : searchers) {
                                        TopFieldDocs hits = s.search(q, 10, sort);
                                        totHits += hits.totalHits;
                                    }
                                } else {
                                    //System.out.println("\nRUN QUERY " + q);
                                    //long t0 = System.nanoTime();
                                    for (IndexSearcher s : searchers) {
                                        int hitCount = s.count(q);
                                        totHits += hitCount;
                                        if (false && iter == 0) {
                                            System.out.println("q=" + q + " lat=" + centerLat + " lon="
                                                    + centerLon + " distanceMeters=" + distanceMeters
                                                    + " hits: " + hitCount);
                                        }
                                    }
                                }
                            } else {
                                assert nearestHits != null;
                                totHits += nearestHits.length;
                            }
                            queryCount++;
                            //throw new RuntimeException("now stop");
                        }
                    }
                }
            }

            long tEnd = System.nanoTime();
            double elapsedSec = (tEnd - tStart) / 1000000000.0;
            double qps = queryCount / elapsedSec;
            double mhps = (totHits / 1000000.0) / elapsedSec;
            if (queryClass.equals("nearest")) {
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f QPS (%.2f sec for %d queries), totNearestDistance=%.10f, totHits=%d",
                        iter, qps, elapsedSec, queryCount, totNearestDistance, totHits));
            } else {
                System.out.println(String.format(Locale.ROOT,
                        "ITER %d: %.2f M hits/sec, %.2f QPS (%.2f sec for %d queries), totHits=%d", iter, mhps,
                        qps, elapsedSec, queryCount, totHits));
            }
            if (qps > bestQPS) {
                System.out.println("  ***");
                bestQPS = qps;
                bestMHPS = mhps;
            }
        }
    }
    System.out.println("BEST M hits/sec: " + bestMHPS);
    System.out.println("BEST QPS: " + bestQPS);

    for (IndexSearcher s : searchers) {
        s.getIndexReader().close();
    }
    IOUtils.close(dirs);
}

From source file:perf.IndexAndSearchOpenStreetMapsGeo3D.java

License:Apache License
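
Here maxDoc() is printed as a quick sanity check on the index before running geo3d bounding-box queries over a grid around London.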

private static void queryIndex() throws IOException {
    Directory dir = FSDirectory.open(Paths.get("bkdtestgeo3d"));
    System.out.println("DIR: " + dir);
    IndexReader r = DirectoryReader.open(dir);
    System.out.println("maxDoc=" + r.maxDoc());
    IndexSearcher s = new IndexSearcher(r);
    //SegmentReader sr = (SegmentReader) r.leaves().get(0).reader();
    //BKDTreeReader reader = ((BKDTreeSortedNumericDocValues) sr.getSortedNumericDocValues("point")).getBKDTreeReader();

    //System.out.println("reader MB heap=" + (reader.ramBytesUsed()/1024/1024.));

    // London, UK:
    int STEPS = 5;
    double MIN_LAT = 51.0919106;
    double MAX_LAT = 51.6542719;
    double MIN_LON = -0.3867282;
    double MAX_LON = 0.8492337;
    for (int iter = 0; iter < 100; iter++) {
        long tStart = System.nanoTime();
        long totHits = 0;
        int queryCount = 0;
        for (int latStep = 0; latStep < STEPS; latStep++) {
            double lat = MIN_LAT + latStep * (MAX_LAT - MIN_LAT) / STEPS;
            for (int lonStep = 0; lonStep < STEPS; lonStep++) {
                double lon = MIN_LON + lonStep * (MAX_LON - MIN_LON) / STEPS;
                for (int latStepEnd = latStep + 1; latStepEnd <= STEPS; latStepEnd++) {
                    double latEnd = MIN_LAT + latStepEnd * (MAX_LAT - MIN_LAT) / STEPS;
                    for (int lonStepEnd = lonStep + 1; lonStepEnd <= STEPS; lonStepEnd++) {
                        double lonEnd = MIN_LON + lonStepEnd * (MAX_LON - MIN_LON) / STEPS;

                        Query q = new PointInGeo3DShapeQuery(PlanetModel.WGS84, "point",
                                GeoBBoxFactory.makeGeoBBox(PlanetModel.WGS84, toRadians(latEnd), toRadians(lat),
                                        toRadians(lon), toRadians(lonEnd)));
                        TotalHitCountCollector c = new TotalHitCountCollector();
                        //long t0 = System.nanoTime();
                        s.search(q, c);

                        //System.out.println("\nITER: now query lat=" + lat + " latEnd=" + latEnd + " lon=" + lon + " lonEnd=" + lonEnd);
                        //Bits hits = reader.intersect(lat, latEnd, lon, lonEnd);
                        //System.out.println("  total hits: " + hitCount);
                        //totHits += ((FixedBitSet) hits).cardinality();
                        //System.out.println("  add tot " + c.getTotalHits());
                        totHits += c.getTotalHits();
                        queryCount++;
                    }
                }
            }
        }

        long tEnd = System.nanoTime();
        System.out.println("ITER: " + iter + " " + ((tEnd - tStart) / 1000000000.0) + " sec; totHits=" + totHits
                + "; " + queryCount + " queries");
    }

    IOUtils.close(r, dir);
}

From source file:perf.PKLookupPerfTest3X.java

License:Apache License
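
This primary-key lookup benchmark draws random IDs bounded by maxDoc() and accumulates per-segment maxDoc() values to map segment-local doc IDs to global ones.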

public static void main(String[] args) throws IOException {

    final Directory dir;
    final String dirImpl = args[0];
    final String dirPath = args[1];
    final int numDocs = Integer.parseInt(args[2]);
    final int numLookups = Integer.parseInt(args[3]);
    final long seed = Long.parseLong(args[4]);

    if (dirImpl.equals("MMapDirectory")) {
        dir = new MMapDirectory(new File(dirPath));
    } else if (dirImpl.equals("NIOFSDirectory")) {
        dir = new NIOFSDirectory(new File(dirPath));
    } else if (dirImpl.equals("SimpleFSDirectory")) {
        dir = new SimpleFSDirectory(new File(dirPath));
    } else {
        throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
    }

    if (!new File(dirPath).exists()) {
        createIndex(dir, numDocs);
    }

    final IndexReader r = IndexReader.open(dir);
    System.out.println("Reader=" + r);

    final IndexReader[] subs = r.getSequentialSubReaders();
    final TermDocs[] termDocsArr = new TermDocs[subs.length];
    for (int subIdx = 0; subIdx < subs.length; subIdx++) {
        termDocsArr[subIdx] = subs[subIdx].termDocs();
    }

    final int maxDoc = r.maxDoc();
    final Random rand = new Random(seed);

    for (int cycle = 0; cycle < 10; cycle++) {
        System.out.println("Cycle: " + (cycle == 0 ? "warm" : "test"));
        System.out.println("  Lookup...");
        final Term[] lookup = new Term[numLookups];
        final int[] docIDs = new int[numLookups];
        final Term protoTerm = new Term("id");
        for (int iter = 0; iter < numLookups; iter++) {
            // Base 36, prefixed with 0s to be length 6 (= 2.2 B)
            lookup[iter] = protoTerm.createTerm(
                    String.format("%6s", Integer.toString(rand.nextInt(maxDoc), Character.MAX_RADIX))
                            .replace(' ', '0'));
        }
        Arrays.fill(docIDs, -1);

        final AtomicBoolean failed = new AtomicBoolean(false);

        final Term t = new Term("id", "");

        final long tStart = System.currentTimeMillis();
        for (int iter = 0; iter < numLookups; iter++) {
            //System.out.println("lookup " + lookup[iter].utf8ToString());
            int base = 0;
            int found = 0;
            for (int subIdx = 0; subIdx < subs.length; subIdx++) {
                final IndexReader sub = subs[subIdx];
                if (!DO_DOC_LOOKUP) {
                    final int df = sub.docFreq(lookup[iter]);
                    if (df != 0) {
                        if (df != 1) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            System.out.println("FAIL0");
                            failed.set(true);
                        }
                    }
                } else {
                    final TermDocs termDocs = termDocsArr[subIdx];
                    termDocs.seek(lookup[iter]);
                    if (termDocs.next()) {
                        found++;
                        if (found > 1) {
                            // Should have been found only once across segs
                            failed.set(true);
                        }
                        final int docID = termDocs.doc();
                        if (docIDs[iter] != -1) {
                            // Same doc should only be seen once
                            failed.set(true);
                        }
                        docIDs[iter] = base + docID;
                        if (termDocs.next()) {
                            // Only 1 doc should be found
                            failed.set(true);
                        }
                    }
                }
                base += sub.maxDoc();
            }
        }
        final long tLookup = (System.currentTimeMillis() - tStart);

        // cycle 0 is for warming
        //System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups + " lookups (" + (1000*tLookup/numLookups) + " us per lookup) + totSeekMS=" + (BlockTermsReader.totSeekNanos/1000000.));
        System.out.println("  " + (cycle == 0 ? "WARM: " : "") + tLookup + " msec for " + numLookups
                + " lookups (" + (1000.0 * tLookup / numLookups) + " us per lookup)");

        if (failed.get()) {
            throw new RuntimeException("at least one lookup produced more than one result");
        }

        if (DO_DOC_LOOKUP) {
            System.out.println("  Verify...");
            for (int iter = 0; iter < numLookups; iter++) {
                if (docIDs[iter] == -1) {
                    throw new RuntimeException("lookup of " + lookup[iter] + " failed iter=" + iter);
                }
                final String found = r.document(docIDs[iter]).get("id");
                if (!found.equals(lookup[iter].text())) {
                    throw new RuntimeException(
                            "lookup of docid=" + lookup[iter].text() + " hit wrong docid=" + found);
                }
            }
        }
    }

    // System.out.println("blocks=" + BlockTermsReader.totBlockReadCount + " scans=" + BlockTermsReader.totScanCount + " " + (((float) BlockTermsReader.totScanCount))/(BlockTermsReader.totBlockReadCount) + " scans/block");

    r.close();
    dir.close();
}

From source file:perf.RandomFilter.java

License:Apache License
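
This filter visits every document number below maxDoc() and keeps a random fraction of them.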

@Override
public DocIdSet getDocIdSet(IndexReader reader) {
    final Random rand = new Random(42);
    final int maxDoc = reader.maxDoc();
    OpenBitSet bits = new OpenBitSet(maxDoc);
    for (int docID = 0; docID < maxDoc; docID++) {
        if (rand.nextDouble() <= pctKeep) {
            bits.fastSet(docID);
        }
    }

    System.out.println("rfilt " + bits.cardinality());
    return bits;
}

From source file:pretraga.IsolationSimilarity.java
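
In this example maxDoc() provides the document count used to compute IDF values and collection statistics.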

public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);

            TermsEnum te = terms.iterator();
            Object tmp = te.next();
            while (tmp != null) {
                BytesRef by = (BytesRef) tmp;
                String term = by.utf8ToString();

                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                } else {
                    break; // the IDF/TF math below needs ClassicSimilarity; bail out rather than NPE
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
                tmp = te.next();
            }

            /*Iterator<String> ss = ff.iterator();
            while (ss.hasNext()) {
            String fieldString = ss.next();
            System.out.println(fieldString);
            }*/
        }
    } catch (Exception e) {
        e.printStackTrace(); // report rather than silently swallow failures
    }
}

From source file:proj.zoie.api.ZoieSegmentReader.java

License:Apache License
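
Zoie allocates a UID array of length maxDoc() and fills it from term-position payloads, marking gaps with DELETED_UID.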

private void init(IndexReader reader) throws IOException {
    int maxDoc = reader.maxDoc();
    _uidArray = new long[maxDoc];
    TermPositions tp = null;
    byte[] payloadBuffer = new byte[8]; // eight bytes for a long
    try {
        tp = reader.termPositions(UID_TERM);
        int idx = 0;
        while (tp.next()) {
            int doc = tp.doc();
            assert doc < maxDoc;

            while (idx < doc)
                _uidArray[idx++] = DELETED_UID; // fill the gap

            tp.nextPosition();
            tp.getPayload(payloadBuffer, 0);
            long uid = bytesToLong(payloadBuffer);
            if (uid < _minUID)
                _minUID = uid;
            if (uid > _maxUID)
                _maxUID = uid;
            _uidArray[idx++] = uid;
        }
        while (idx < maxDoc)
            _uidArray[idx++] = DELETED_UID; // fill the gap
    } finally {
        if (tp != null) {
            tp.close();
        }
    }
}

From source file:psidev.psi.mi.search.engine.impl.AbstractSearchEngine.java

License:Apache License
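
Here maxDoc() acts as the total result count when paging through every document in the index.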

public SearchResult<T> searchAll(Integer firstResult, Integer maxResults) throws SearchEngineException {
    if (firstResult == null)
        firstResult = 0;
    if (maxResults == null)
        maxResults = MAX_TOP_RESULTS;

    IndexReader reader = indexSearcher.getIndexReader();

    int totalCount = reader.maxDoc();

    if (maxResults == 0) {
        return new SearchResult(Collections.EMPTY_LIST, totalCount, firstResult, maxResults,
                new WildcardQuery(new Term("", "*")));
    }

    // this is a hack to ignore any header introduced in the index by mistake (first development versions)
    if (reader.isDeleted(0)) {
        firstResult++;
        totalCount--;
    }

    if (firstResult > totalCount) {
        //            closeIndexReader(reader);
        return new SearchResult(Collections.EMPTY_LIST, totalCount, firstResult, maxResults,
                new WildcardQuery(new Term("", "*")));
    }

    int maxIndex = Math.min(totalCount, firstResult + maxResults);

    List<T> dataObjects = new ArrayList<T>();

    for (int i = firstResult; i < maxIndex; i++) {
        try {
            Document doc = reader.document(i);
            T data = (T) createDocumentBuilder().createData(doc);
            dataObjects.add(data);
        } catch (Exception e) {
            //                closeIndexReader(reader);
            throw new SearchEngineException(e);
        }
    }

    //        closeIndexReader(reader);
    return new SearchResult(dataObjects, totalCount, firstResult, maxResults,
            new WildcardQuery(new Term("", "*")));
}

From source file:solr2155.lucene.spatial.geohash.GeoHashPrefixFilter.java

License:Apache License
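
This spatial filter allocates an OpenBitSet of maxDoc() bits before walking geohash prefix terms.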

@Override
public DocIdSet getDocIdSet(IndexReader reader) throws IOException {
    final OpenBitSet bits = new OpenBitSet(reader.maxDoc());
    final TermsEnumCompatibility termsEnum = new TermsEnumCompatibility(reader, fieldName);//Lucene 4 compatibility wrapper
    final TermDocs termDocs = reader.termDocs();
    Term term = termsEnum.term();//the most recent term examined via termsEnum.term()
    if (term == null)
        return bits;

    //TODO Add a precision short-circuit so that we are not accurate on the edge but we're faster.

    //TODO An array based nodes impl would be more efficient; or a stack of iterators.  LinkedList conveniently has bulk add to beginning.
    LinkedList<GridNode> nodes = new LinkedList<GridNode>(
            gridReferenceSystem.getSubNodes(geoShape.boundingRectangle()));
    while (!nodes.isEmpty() && term != null) {
        final GridNode node = nodes.removeFirst();
        assert node.length() > 0;
        if (!node.contains(term.text()) && node.before(term.text()))
            continue;//short circuit, moving >= the next indexed term
        IntersectCase intersection = geoShape.intersect(node.getRectangle());
        if (intersection == IntersectCase.OUTSIDE)
            continue;
        TermsEnumCompatibility.SeekStatus seekStat = termsEnum.seek(node.getTermVal());
        term = termsEnum.term();
        if (seekStat != TermsEnumCompatibility.SeekStatus.FOUND)
            continue;
        if (intersection == IntersectCase.CONTAINS) {
            termDocs.seek(term);
            addDocs(termDocs, bits);
            term = termsEnum.next();//move to next term
        } else {//any other intersection
            //TODO is it worth it to optimize the shape (e.g. potentially simpler polygon)?
            //GeoShape geoShape = this.geoShape.optimize(intersection);

            //We either scan through the leaf node(s), or if there are many points then we divide & conquer.
            boolean manyPoints = node.length() < gridReferenceSystem.maxLen - GRIDLEN_SCAN_THRESHOLD;

            //TODO Try variable depth strategy:
            //IF configured to do so, we could use term.freq() as an estimate on the number of places at this depth.  OR, perhaps
            //  make estimates based on the total known term count at this level?  Or don't worry about it--use fixed depth.
            //        if (manyPoints) {
            //          //Make some estimations on how many points there are at this level and how few there would need to be to set
            //          // manyPoints to false.
            //
            //          long termsThreshold = (long) estimateNumberIndexedTerms(node.length(),geoShape.getDocFreqExpenseThreshold(node));
            //
            //          long thisOrd = termsEnum.ord();
            //          manyPoints = (termsEnum.seek(thisOrd+termsThreshold+1) != TermsEnum.SeekStatus.END
            //                  && node.contains(termsEnum.term()));
            //          termsEnum.seek(thisOrd);//return to last position
            //        }

            if (!manyPoints) {
                //traverse all leaf terms within this node to see if they are within the geoShape, one by one.
                for (; term != null && node.contains(term.text()); term = termsEnum.next()) {
                    if (term.text().length() < gridReferenceSystem.maxLen)//not a leaf
                        continue;
                    final Point2D point = gridReferenceSystem.decodeXY(term.text());
                    //Filter those out of the shape.
                    if (!geoShape.contains(point))
                        continue;

                    //record
                    termDocs.seek(term);
                    addDocs(termDocs, bits);
                }
            } else {
                //divide & conquer
                nodes.addAll(0, node.getSubNodes());//add to beginning
            }
        }
    } //node loop

    return bits;
}

From source file:solr2155.solr.search.function.GeoHashValueSource.java

License:Apache License
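
This value source lazily allocates a per-document points cache with one slot per document number, using reader.maxDoc() as its length.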

@SuppressWarnings({ "unchecked" })
GeoHashValueSource(String fieldName, SolrIndexSearcher searcher) throws IOException {
    log.info("Loading geohash field " + fieldName + " into memory.");
    this.fieldName = fieldName;

    //Get gridReferenceSystem
    final GridNode.GridReferenceSystem gridReferenceSystem;
    FieldType fieldType = searcher.getSchema().getField(fieldName).getType();
    if (fieldType instanceof GeoHashField) {
        gridReferenceSystem = ((GeoHashField) fieldType).getGridReferenceSystem();
    } else
        throw new RuntimeException(
                "field " + fieldName + " should be a GeoHashField, not " + fieldType.getTypeName());

    //Traverse the index to load up doc2PointsCache
    IndexReader reader = searcher.getIndexReader();
    TermsEnumCompatibility termsEnum = new TermsEnumCompatibility(reader, fieldName);
    TermDocs termDocs = reader.termDocs(); //cached for termsEnum.docs() calls
    try {
        while (true) {
            final Term term = termsEnum.next();
            if (term == null)
                break;
            if (term.text().length() != gridReferenceSystem.getPrecision())
                continue;
            Point2D point = gridReferenceSystem.decodeXY(term.text());
            termDocs.seek(termsEnum.getTermEnum());
            while (termDocs.next()) {
                final int docId = termDocs.doc();
                if (docId == DocIdSetIterator.NO_MORE_DOCS)
                    break;
                if (doc2PointsCache == null)
                    doc2PointsCache = (List<Point2D>[]) new List[reader.maxDoc()];//java generics hack
                List<Point2D> points = doc2PointsCache[docId];
                if (points == null) {
                    points = new ArrayList<Point2D>(DEFAULT_ARRAY_CAPACITY);
                    doc2PointsCache[docId] = points;
                }
                points.add(point);
            }
        }
    } finally { // in Lucene 3 these should be closed (not in Lucene 4)
        termDocs.close();
        termsEnum.close();
    }

    //Log statistics
    if (log.isInfoEnabled()) {
        int min = Integer.MAX_VALUE, sum = 0, max = 0;
        int dlen = 0;
        if (doc2PointsCache != null) {
            dlen = doc2PointsCache.length;
            for (List<Point2D> point2Ds : doc2PointsCache) {
                int plen = (point2Ds == null ? 0 : point2Ds.size());
                min = Math.min(min, plen);
                max = Math.max(max, plen);
                sum += plen;
            }
        }
        if (min == Integer.MAX_VALUE)
            min = 0;
        float avg = (float) sum / dlen;
        log.info("field '" + fieldName + "' in RAM: loaded min/avg/max per doc #: (" + min + "," + avg + ","
                + max + ") #" + dlen);
    }
}