Example usage for org.apache.lucene.index IndexReader maxDoc

List of usage examples for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage of org.apache.lucene.index IndexReader maxDoc.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
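In the pre-4.0 Lucene API that the examples below target, document numbers run from 0 to maxDoc() - 1, and maxDoc() still counts deleted documents, so it is the usual size for per-document arrays and the usual loop bound. A minimal sketch, assuming an index at a placeholder path:

IndexReader reader = IndexReader.open(FSDirectory.open(new File("/path/to/index")), true);
try {
    int maxDoc = reader.maxDoc();              // one greater than the largest document number
    int numDocs = reader.numDocs();            // live (non-deleted) documents only
    float[] perDocValue = new float[maxDoc];   // safe size for any docid this reader can return
    for (int docid = 0; docid < maxDoc; docid++) {
        if (reader.isDeleted(docid)) {
            continue;                          // deleted slots still count toward maxDoc()
        }
        // ... work with reader.document(docid) or perDocValue[docid]
    }
} finally {
    reader.close();
}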

Usage

From source file:com.browseengine.bobo.facets.data.MultiValueWithWeightFacetDataCache.java

License:Apache License

/**
 * Loads multi-value facet data. This method uses a work area to prepare loading.
 * @param fieldName
 * @param reader
 * @param listFactory
 * @param workArea
 * @throws IOException
 */
public void load(String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea)
        throws IOException {
    long t0 = System.currentTimeMillis();
    int maxdoc = reader.maxDoc();
    BufferedLoader loader = getBufferedLoader(maxdoc, workArea);
    BufferedLoader weightLoader = getBufferedLoader(maxdoc, null);

    TermEnum tenum = null;
    TermDocs tdoc = null;
    TermValueList<T> list = (listFactory == null ? (TermValueList<T>) new TermStringList()
            : listFactory.createTermList());
    IntArrayList minIDList = new IntArrayList();
    IntArrayList maxIDList = new IntArrayList();
    IntArrayList freqList = new IntArrayList();
    OpenBitSet bitset = new OpenBitSet(maxdoc + 1);
    int negativeValueCount = getNegativeValueCount(reader, fieldName.intern());
    int t = 0; // current term number
    list.add(null);
    minIDList.add(-1);
    maxIDList.add(-1);
    freqList.add(0);
    t++;

    _overflow = false;

    String pre = null;

    int df = 0;
    int minID = -1;
    int maxID = -1;
    int valId = 0;

    try {
        tdoc = reader.termDocs();
        tenum = reader.terms(new Term(fieldName, ""));
        if (tenum != null) {
            do {
                Term term = tenum.term();
                if (term == null || !fieldName.equals(term.field()))
                    break;

                String val = term.text();

                if (val != null) {
                    int weight = 0;
                    String[] split = val.split("\u0000");
                    if (split.length > 1) {
                        val = split[0];
                        weight = Integer.parseInt(split[split.length - 1]);
                    }
                    if (pre == null || !val.equals(pre)) {
                        if (pre != null) {
                            freqList.add(df);
                            minIDList.add(minID);
                            maxIDList.add(maxID);
                        }

                        list.add(val);

                        df = 0;
                        minID = -1;
                        maxID = -1;
                        valId = (t - 1 < negativeValueCount) ? (negativeValueCount - t + 1) : t;
                        t++;
                    }

                    tdoc.seek(tenum);
                    if (tdoc.next()) {
                        df++;
                        int docid = tdoc.doc();

                        if (!loader.add(docid, valId))
                            logOverflow(fieldName);
                        else
                            weightLoader.add(docid, weight);

                        if (docid < minID)
                            minID = docid;
                        bitset.fastSet(docid);
                        while (tdoc.next()) {
                            df++;
                            docid = tdoc.doc();

                            if (!loader.add(docid, valId))
                                logOverflow(fieldName);
                            else
                                weightLoader.add(docid, weight);

                            bitset.fastSet(docid);
                        }
                        if (docid > maxID)
                            maxID = docid;
                    }
                    pre = val;
                }

            } while (tenum.next());
            if (pre != null) {
                freqList.add(df);
                minIDList.add(minID);
                maxIDList.add(maxID);
            }
        }
    } finally {
        try {
            if (tdoc != null) {
                tdoc.close();
            }
        } finally {
            if (tenum != null) {
                tenum.close();
            }
        }
    }

    list.seal();

    try {
        _nestedArray.load(maxdoc + 1, loader);
        _weightArray.load(maxdoc + 1, weightLoader);
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException("failed to load due to " + e.toString(), e);
    }

    this.valArray = list;
    this.freqs = freqList.toIntArray();
    this.minIDs = minIDList.toIntArray();
    this.maxIDs = maxIDList.toIntArray();

    int doc = 0;
    while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) {
        ++doc;
    }
    if (doc <= maxdoc) {
        this.minIDs[0] = doc;
        doc = maxdoc;
        while (doc > 0 && !_nestedArray.contains(doc, 0, true)) {
            --doc;
        }
        if (doc > 0) {
            this.maxIDs[0] = doc;
        }
    }
    this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality();
}
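A note on the term format consumed above: the loader splits each indexed term on '\u0000' and parses the trailing segment as an integer weight, so terms are expected to look like "<value>\u0000<weight>". A hypothetical helper (not part of Bobo's published API) that builds such a term text on the indexing side might look like:

static String encodeWeightedValue(String value, int weight) {
    // illustration only: value first, then the '\u0000' separator, then the decimal weight
    return value + '\u0000' + weight;          // e.g. "red" with weight 5 -> "red" + '\u0000' + "5"
}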

From source file:com.browseengine.bobo.search.section.IntMetaDataCache.java

License:Apache License

public IntMetaDataCache(Term term, IndexReader reader) throws IOException {
    _reader = reader;

    int maxDoc = reader.maxDoc();
    _list = new int[(maxDoc + MAX_SLOTS - 1) / MAX_SLOTS][];
    _curPageNo = 0;
    _curSlot = 0;
    _curData = MAX_SLOTS;

    if (maxDoc > 0) {
        _curPage = new int[MAX_SLOTS * 2];
        loadPayload(term);
    }

    _curPage = null;
}

From source file:com.browseengine.bobo.test.BoboTestCase.java

License:Open Source License

private Directory createIndex() {
    RAMDirectory idxDir = new RAMDirectory();

    try {
        Document[] data = buildData();

        TestDataDigester testDigester = new TestDataDigester(_fconf, data);
        BoboIndexer indexer = new BoboIndexer(testDigester, idxDir);
        indexer.index();
        IndexReader r = IndexReader.open(idxDir, false);
        r.deleteDocument(r.maxDoc() - 1);
        //r.flush();
        r.close();
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    return idxDir;

}

From source file:com.browseengine.bobo.tools.CarDataDigest.java

License:Open Source License

public void digest(DataHandler handler) throws IOException {
    int numcars = getMaxDocs();
    Random rand = new Random();

    IndexReader reader = null;
    try {
        reader = IndexReader.open(FSDirectory.open(getDataFile()), true);
        int carcount = reader.maxDoc();

        Document[] docCache = new Document[carcount];
        for (int i = 0; i < carcount; ++i) {
            docCache[i] = reader.document(i);
        }

        for (int i = 0; i < numcars; ++i) {
            if (i != 0 && i % 1000 == 0) {
                System.out.println(i + " cars indexed.");
            }
            Document doc = new Document();
            int n = rand.nextInt(10);
            if (n == 0) {
                makeCar(doc, cars[rand.nextInt(cars.length)]);
            } else {
                Document srcDoc = docCache[rand.nextInt(carcount)];
                makeCar(doc, srcDoc);
            }

            populateDocument(doc, null);
            handler.handleDocument(doc);
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}

From source file:com.browseengine.local.glue.GeoSearchFilter.java

License:Open Source License

/**
 * Broken at the poles.
 * 
 * NOTE that this isn't actually precise, since it means the result 
 * fits in the box, not in the hypercircle.  Notably, for all 
 * valid results, bits.get(i) is true.  However, a document with 
 * bits.get(i) set might lie inside the box but outside the radius 
 * of the search.
 * 
 * The ratio of correct answers to total candidates, if taken on 
 * a flat plane assuming a circle rather than an ellipse, is somewhere 
 * around PI*r^2/(2*r)^2 = PI/4 = 0.785.
 * 
 * We could also optionally compute a bit set within whose bounds 
 * bits.get(i) implies it's a result, but !bits.get(i) doesn't 
 * tell us if it's a result or not.  
 * 
 * However, the former gives us the option of refinement at scoring 
 * time, in particular if the user has chosen to sort by distance.  
 *
 * But hit counts and appearance or disappearance of results 
 * during browse might lead to confusion, and necessitate actual 
 * result set inclusion at this step.  If this is the case, we 
 * can use the inner and outer bounds as rules to only actually 
 * compute distance here if it is between the inner box and outer 
 * box.
 */
public BitSet makeBitSet(IndexReader reader) throws IOException {
    if (_rangeInMiles < 0f) {
        // all bits on by default
        int maxDoc = reader.maxDoc();
        BitSet bits = new BitSet(maxDoc);
        bits.set(0, maxDoc);
        return bits;
    }
    return makeBitSetFast(reader);
}

From source file:com.browseengine.local.glue.GeoSearchFilter.java

License:Open Source License

/**
 * The fastest way to make the bit set.  The rule is that a bit is 
 * set to true iff the corresponding document is a possible candidate 
 * for being a result.  This means it just has to be inside the outer 
 * bounding "box" created by the min/max lon/lat values that are 
 * possible as results.  Hence the returned bit set represents a set 
 * that contains every result, but some of whose members may not be 
 * within the true distance specified (we estimate this represents on 
 * average less than 22% of the total set size).
 * 
 * @param reader
 * @return
 * @throws IOException
 */
public BitSet makeBitSetFast(IndexReader reader) throws IOException {
    int maxDoc = reader.maxDoc();
    BitSet bits = new BitSet(maxDoc);
    Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees);

    // outer box only
    int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles);
    int minLon = bounds[HaversineWrapper.LON_MIN];
    int maxLon = bounds[HaversineWrapper.LON_MAX];
    int minLat = bounds[HaversineWrapper.LAT_MIN];
    int maxLat = bounds[HaversineWrapper.LAT_MAX];
    int lonAsInt;
    int latAsInt;
    for (int i = 0; i < maxDoc; i++) {
        lonAsInt = _lonLats.lons[i];
        latAsInt = _lonLats.lats[i];
        if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) {
            bits.set(i);
        }
    }
    return bits;
}

From source file:com.browseengine.local.glue.GeoSearchFilter.java

License:Open Source License

/**
 * Broken at the poles.
 * 
 * A more accurate representation of the result set, 
 * computed by using actual distance measures for everything outside an 
 * inner bounding box, but inside the outer bounding box.  
 * The improved accuracy comes at a performance hit when compared to 
 * {@link #makeBitSetFast(IndexReader)}.
 * 
 * The inaccuracies 
 * would come from an incorrect computation of the inner bounding box 
 * (this should be improved upon if there's time--maybe just make it 
 * a little smaller for added computation cost?).
 * 
 * @param reader
 * @return
 * @throws IOException
 */
public BitSet makeBitSetMoreAccurate(IndexReader reader) throws IOException {
    int maxDoc = reader.maxDoc();
    BitSet bits = new BitSet(maxDoc);
    Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees);

    // outer box
    int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles);
    int minLon = bounds[HaversineWrapper.LON_MIN];
    int maxLon = bounds[HaversineWrapper.LON_MAX];
    int minLat = bounds[HaversineWrapper.LAT_MIN];
    int maxLat = bounds[HaversineWrapper.LAT_MAX];

    // inner box approximation, test all outside inner box
    int lonSpread = maxLon - minLon;
    lonSpread = (int) (Math.round(lonSpread / SQRT_TWO) / 2);
    int latSpread = maxLat - minLat;
    latSpread = (int) (Math.round(latSpread / SQRT_TWO) / 2);
    int lonAsInt = GeoSearchFields.dubToInt(_lonDegrees);
    int latAsInt = GeoSearchFields.dubToInt(_latDegrees);
    int iminLon = lonAsInt - lonSpread;
    int imaxLon = lonAsInt + lonSpread;
    int iminLat = latAsInt - latSpread;
    int imaxLat = latAsInt + latSpread;

    double centerLonRad = centroid.getLongitudeRad();
    double centerLatRad = centroid.getLatitudeRad();

    for (int i = 0; i < maxDoc; i++) {
        lonAsInt = _lonLats.lons[i];
        latAsInt = _lonLats.lats[i];
        if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) {
            if (lonAsInt >= iminLon && lonAsInt <= imaxLon && latAsInt >= iminLat && latAsInt <= imaxLat) {
                bits.set(i);
            } else if (HaversineWrapper.computeHaversineDistanceMiles(centerLonRad, centerLatRad, lonAsInt,
                    latAsInt) <= _rangeInMiles) {
                bits.set(i);
            }
        }
    }
    return bits;

}

From source file:com.browseengine.local.glue.GeoSearchFilter.java

License:Open Source License

/**
 * Broken at the poles.
 * 
 * Otherwise, this is an accurate representation of the true result set, but runs slower than 
 * {@link #makeBitSetMoreAccurate(IndexReader)}.  It computes the actual distance for every 
 * member in the set, and includes it iff it is within the bounds.
 * 
 * @param reader
 * @return
 * @throws IOException
 */
public BitSet makeBitSetCompletelyAccurate(IndexReader reader) throws IOException {
    int maxDoc = reader.maxDoc();
    BitSet bits = new BitSet(maxDoc);
    Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees);

    // outer box only
    int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles);
    int minLon = bounds[HaversineWrapper.LON_MIN];
    int maxLon = bounds[HaversineWrapper.LON_MAX];
    int minLat = bounds[HaversineWrapper.LAT_MIN];
    int maxLat = bounds[HaversineWrapper.LAT_MAX];
    int lonAsInt;
    int latAsInt;

    double centerLonRad = centroid.getLongitudeRad();
    double centerLatRad = centroid.getLatitudeRad();

    for (int i = 0; i < maxDoc; i++) {
        lonAsInt = _lonLats.lons[i];
        latAsInt = _lonLats.lats[i];
        if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) {
            if (HaversineWrapper.computeHaversineDistanceMiles(centerLonRad, centerLatRad, lonAsInt,
                    latAsInt) <= _rangeInMiles) {
                bits.set(i);
            }
        }
    }
    return bits;

}
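GeoSearchFilter itself is not reproduced in full on this page, but if it extends the pre-3.0 org.apache.lucene.search.Filter API (which exposed a BitSet per reader), the builders above would typically back that callback. A hedged sketch of how the pieces could be wired together:

public BitSet bits(IndexReader reader) throws IOException {
    // box-only candidate set by default; swap in makeBitSetCompletelyAccurate(reader)
    // when exact membership matters more than speed
    return makeBitSet(reader);
}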

From source file:com.browseengine.local.service.geoindex.GeoResourceWriter.java

License:Open Source License

public synchronized void optimize() throws IOException, GeoIndexingException {
    if (_writer != null) {
        if (_path != null) {
            _writer.optimize();
            File path2 = new File(_path.getParentFile(), _path.getName() + ".tmp");
            _writer.close();
            _writer = null;
            if (_path.renameTo(path2)) {
                IndexReader reader = null;
                TermEnum termEnum = null;
                TermDocs termDocs = null;
                try {
                    reader = IndexReader.open(path2);
                    int maxDoc = reader.maxDoc();
                    if (maxDoc <= 0) {
                        throw new GeoIndexingException("can't optimize an index with " + maxDoc + " docs");
                    }
                    LonDocid[] lonDocids = new LonDocid[maxDoc];
                    String fld = GeoSearchFields.LON.getField().intern();
                    Term term = new Term(fld, "");
                    termEnum = reader.terms(term);
                    termDocs = reader.termDocs();
                    while ((term = termEnum.term()) != null && term.field() == fld) {
                        double lon = Double.parseDouble(term.text());
                        termDocs.seek(term);
                        while (termDocs.next()) {
                            int docid = termDocs.doc();
                            lonDocids[docid] = new LonDocid(docid, lon);
                        }
                        termEnum.next();
                    }
                    termDocs.close();
                    termDocs = null;
                    termEnum.close();
                    termEnum = null;
                    Arrays.sort(lonDocids);
                    init(_path, true);
                    for (int i = 0; i < lonDocids.length; i++) {
                        int docid = lonDocids[i].docid;
                        Document doc = reader.document(docid);
                        // all fields are stored
                        String name = doc.get(GeoSearchFields.NAME.getField());
                        String description = doc.get(GeoSearchFields.DESCRIPTION.getField());
                        String addressStr = doc.get(GeoSearchFields.ADDRESS.getField());
                        String phoneStr = doc.get(GeoSearchFields.PHONE.getField());
                        long phoneNumber = LocalResource.NO_PHONE_NUMBER;
                        if (phoneStr != null && phoneStr.length() > 0) {
                            phoneNumber = Long.parseLong(phoneStr);
                        }
                        String lonStr = doc.get(GeoSearchFields.LON.getField());
                        double lon = Double.parseDouble(lonStr);
                        String latStr = doc.get(GeoSearchFields.LAT.getField());
                        double lat = Double.parseDouble(latStr);

                        LocalResource resource = new LocalResource(name, description, addressStr, phoneNumber,
                                lon, lat);
                        addResource(resource);
                    }
                    reader.close();
                    reader = null;

                    _writer.optimize();

                    LOGGER.info("successfully completed optimization of index at " + _path.getAbsolutePath());
                } finally {
                    try {
                        // erase the tmp dir
                        recursiveDelete(path2);
                    } finally {
                        try {
                            if (reader != null) {
                                reader.close();
                            }
                        } finally {
                            try {
                                if (termEnum != null) {
                                    termEnum.close();
                                }
                            } finally {
                                try {
                                    if (termDocs != null) {
                                        termDocs.close();
                                    }
                                } finally {
                                    reader = null;
                                    termDocs = null;
                                    termEnum = null;
                                }
                            }
                        }
                    }
                }
            } else {
                init(_path, false);
                throw new GeoIndexingException("trouble doing the rename from " + _path.getAbsolutePath()
                        + " to " + path2.getAbsolutePath() + "; check permissions");
            }
        } else {
            _writer.optimize();
        }
    } else {
        throw new GeoIndexingException("attempt to optimize a closed " + GeoResourceWriter.class.getName());
    }
}

From source file:com.browseengine.local.service.geosearch.GeoSearchImpl.java

License:Open Source License

/**
 * lon and lat are stored in the index as 7-decimal place precision values of 
 * degrees.
 * 2^32 = 4,294,967,296 values.  Signed gives us -2^31 to 2^31-1.
 * if we multiply the normal value 179.123456 by 10^6, we get 179,123,456, 
 * which fits in this space.
 * if we multiply the normal value 179.1234567 by 10^7, we get 1,791,234,567, 
 * which fits in this space.
 * @throws IOException
 * @throws GeoSearchingException 
 */
public static int[] loadDegreeFieldIntoInt(IndexReader reader, String fld)
        throws IOException, GeoSearchingException {
    int[] vals = new int[reader.maxDoc()];
    TermEnum termEnum = null;
    TermDocs termDocs = null;
    try {
        fld = fld.intern();
        Term term = new Term(fld, "");
        termEnum = reader.terms(term);
        termDocs = reader.termDocs();
        do {
            term = termEnum.term();
            if (null == term || term.field() != fld) {
                break;
            }
            termDocs.seek(term);
            int numAdded = 0;
            while (termDocs.next()) {
                String str = term.text();
                double dub = Double.parseDouble(str);
                vals[termDocs.doc()] = GeoSearchFields.dubToInt(dub);
                numAdded++;
            }
            if (numAdded <= 0) {
                throw new GeoSearchingException(
                        "data integrity problem in field " + fld + ", term " + term.text());
            }
        } while (termEnum.next());

        return vals;
    } catch (NumberFormatException nfe) {
        throw new GeoSearchingException(
                "data integrity problem, non-numeric field value for field " + fld + ": " + nfe, nfe);
    } finally {
        try {
            if (termDocs != null) {
                termDocs.close();
            }
        } finally {
            try {
                if (termEnum != null) {
                    termEnum.close();
                }
            } finally {
                //
            }
        }
    }
}
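The implementation of GeoSearchFields.dubToInt is not shown on this page, but the javadoc above implies a fixed-point encoding: degrees scaled by 10^7 so that 7 decimal places survive inside a signed 32-bit int. A sketch of that conversion, under that assumption:

static int degreesToInt(double degrees) {
    // assumed 10^7 scale factor: 179.1234567 -> 1,791,234,567, within Integer.MAX_VALUE
    return (int) Math.round(degrees * 10000000.0);
}

static double intToDegrees(int fixedPoint) {
    return fixedPoint / 10000000.0;            // inverse of the scaling above
}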