List of usage examples for org.apache.lucene.index IndexReader maxDoc
/**
 * Abstract accessor whose result callers in this listing use to size per-document arrays
 * and as the exclusive upper bound when iterating over document ids.
 * NOTE(review): presumably one greater than the largest document number in the index --
 * confirm against the Lucene IndexReader documentation for the version in use.
 *
 * @return the document-id bound for this reader
 */
public abstract int maxDoc();
From source file:com.browseengine.bobo.facets.data.MultiValueWithWeightFacetDataCache.java
License:Apache License
/** * loads multi-value facet data. This method uses a workarea to prepare loading. * @param fieldName/*from w w w .j a v a 2 s . c o m*/ * @param reader * @param listFactory * @param workArea * @throws IOException */ public void load(String fieldName, IndexReader reader, TermListFactory<T> listFactory, WorkArea workArea) throws IOException { long t0 = System.currentTimeMillis(); int maxdoc = reader.maxDoc(); BufferedLoader loader = getBufferedLoader(maxdoc, workArea); BufferedLoader weightLoader = getBufferedLoader(maxdoc, null); TermEnum tenum = null; TermDocs tdoc = null; TermValueList<T> list = (listFactory == null ? (TermValueList<T>) new TermStringList() : listFactory.createTermList()); IntArrayList minIDList = new IntArrayList(); IntArrayList maxIDList = new IntArrayList(); IntArrayList freqList = new IntArrayList(); OpenBitSet bitset = new OpenBitSet(maxdoc + 1); int negativeValueCount = getNegativeValueCount(reader, fieldName.intern()); int t = 0; // current term number list.add(null); minIDList.add(-1); maxIDList.add(-1); freqList.add(0); t++; _overflow = false; String pre = null; int df = 0; int minID = -1; int maxID = -1; int valId = 0; try { tdoc = reader.termDocs(); tenum = reader.terms(new Term(fieldName, "")); if (tenum != null) { do { Term term = tenum.term(); if (term == null || !fieldName.equals(term.field())) break; String val = term.text(); if (val != null) { int weight = 0; String[] split = val.split("\u0000"); if (split.length > 1) { val = split[0]; weight = Integer.parseInt(split[split.length - 1]); } if (pre == null || !val.equals(pre)) { if (pre != null) { freqList.add(df); minIDList.add(minID); maxIDList.add(maxID); } list.add(val); df = 0; minID = -1; maxID = -1; valId = (t - 1 < negativeValueCount) ? 
(negativeValueCount - t + 1) : t; t++; } tdoc.seek(tenum); if (tdoc.next()) { df++; int docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); else weightLoader.add(docid, weight); if (docid < minID) minID = docid; bitset.fastSet(docid); while (tdoc.next()) { df++; docid = tdoc.doc(); if (!loader.add(docid, valId)) logOverflow(fieldName); else weightLoader.add(docid, weight); bitset.fastSet(docid); } if (docid > maxID) maxID = docid; } pre = val; } } while (tenum.next()); if (pre != null) { freqList.add(df); minIDList.add(minID); maxIDList.add(maxID); } } } finally { try { if (tdoc != null) { tdoc.close(); } } finally { if (tenum != null) { tenum.close(); } } } list.seal(); try { _nestedArray.load(maxdoc + 1, loader); _weightArray.load(maxdoc + 1, weightLoader); } catch (IOException e) { throw e; } catch (Exception e) { throw new RuntimeException("failed to load due to " + e.toString(), e); } this.valArray = list; this.freqs = freqList.toIntArray(); this.minIDs = minIDList.toIntArray(); this.maxIDs = maxIDList.toIntArray(); int doc = 0; while (doc <= maxdoc && !_nestedArray.contains(doc, 0, true)) { ++doc; } if (doc <= maxdoc) { this.minIDs[0] = doc; doc = maxdoc; while (doc > 0 && !_nestedArray.contains(doc, 0, true)) { --doc; } if (doc > 0) { this.maxIDs[0] = doc; } } this.freqs[0] = maxdoc + 1 - (int) bitset.cardinality(); }
From source file:com.browseengine.bobo.search.section.IntMetaDataCache.java
License:Apache License
public IntMetaDataCache(Term term, IndexReader reader) throws IOException { _reader = reader;//from w w w . j a v a 2 s. c o m int maxDoc = reader.maxDoc(); _list = new int[(maxDoc + MAX_SLOTS - 1) / MAX_SLOTS][]; _curPageNo = 0; _curSlot = 0; _curData = MAX_SLOTS; if (maxDoc > 0) { _curPage = new int[MAX_SLOTS * 2]; loadPayload(term); } _curPage = null; }
From source file:com.browseengine.bobo.test.BoboTestCase.java
License:Open Source License
private Directory createIndex() { RAMDirectory idxDir = new RAMDirectory(); try {/*from ww w . j ava 2 s. c o m*/ Document[] data = buildData(); TestDataDigester testDigester = new TestDataDigester(_fconf, data); BoboIndexer indexer = new BoboIndexer(testDigester, idxDir); indexer.index(); IndexReader r = IndexReader.open(idxDir, false); r.deleteDocument(r.maxDoc() - 1); //r.flush(); r.close(); } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } return idxDir; }
From source file:com.browseengine.bobo.tools.CarDataDigest.java
License:Open Source License
/**
 * Generates {@code getMaxDocs()} car documents and feeds each one to the handler.
 *
 * <p>All documents of the source index are first read into memory. Each generated car is
 * then built either from a random entry of {@code cars} (one time in ten on average) or
 * from a random cached source document. A progress line is printed every 1000 cars.
 *
 * @param handler receiver of every generated document
 * @throws IOException if the source index cannot be opened or read, or the handler fails
 */
public void digest(DataHandler handler) throws IOException {
    final int totalToEmit = getMaxDocs();
    final Random random = new Random();

    IndexReader reader = null;
    try {
        reader = IndexReader.open(FSDirectory.open(getDataFile()), true);

        // Cache every source document up front.
        final int sourceCount = reader.maxDoc();
        final Document[] sourceDocs = new Document[sourceCount];
        int slot = 0;
        while (slot < sourceCount) {
            sourceDocs[slot] = reader.document(slot);
            slot++;
        }

        for (int emitted = 0; emitted < totalToEmit; emitted++) {
            final boolean progressTick = emitted > 0 && emitted % 1000 == 0;
            if (progressTick) {
                System.out.println(emitted + " cars indexed.");
            }

            final Document car = new Document();
            if (random.nextInt(10) == 0) {
                makeCar(car, cars[random.nextInt(cars.length)]);
            } else {
                makeCar(car, sourceDocs[random.nextInt(sourceCount)]);
            }

            populateDocument(car, null);
            handler.handleDocument(car);
        }
    } finally {
        if (reader != null) {
            reader.close();
        }
    }
}
From source file:com.browseengine.local.glue.GeoSearchFilter.java
License:Open Source License
/** * Broken at the poles./* ww w.j a va 2 s . c o m*/ * * NOTE that this isn't actually precise, since it means the result * fits in the box, not in the hypercircle. notably, for all * valid results, bits.get(i) is true. however, bits.get(i) might * be inside the box but ouside the radius of the search. * * The ratio of correct answers to incorrect answers, if taken on * a flat plane assuming a circle rather than elipse, is somewhere * around PI*r^2/(2*r)^2 = PI/4 = 0.785. * * We could also optionally compute a bit set within whose bounds * bits.get(i) implies it's a result, but !bits.get(i) doesn't * tell us if it's a result or not. * * However, the former gives us the option of refinement at scoring * time, in particular if the user has chosen to sort by distance. * * But hit counts and appearance or disappearance of results * during browse might lead to confusion, and necessitate actual * result set inclusion at this step. If this is the case, we * can use the inner and outer bounds as rules to only actually * compute distance here if it is between the inner box and outer * box. */ public BitSet makeBitSet(IndexReader reader) throws IOException { if (_rangeInMiles < 0f) { // all bits on by default int maxDoc = reader.maxDoc(); BitSet bits = new BitSet(maxDoc); bits.set(0, maxDoc); return bits; } return makeBitSetFast(reader); }
From source file:com.browseengine.local.glue.GeoSearchFilter.java
License:Open Source License
/** * The fastest way to make the bit set. The rule is that iff it is * a possible candidate for being a result, it is set to true. * This means that it just has to be inside the outer bounding "box" * created by the min/max lon/lat values that are possible as * resutls. Hence the returned bit set represents a set that * contains every result, but some of whom may not be within the * true distance specified (we estimate this represents on * average less than 22% of the total set size). * /*from w w w .ja v a 2 s .com*/ * @param reader * @return * @throws IOException */ public BitSet makeBitSetFast(IndexReader reader) throws IOException { int maxDoc = reader.maxDoc(); BitSet bits = new BitSet(maxDoc); Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees); // outer box only int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles); int minLon = bounds[HaversineWrapper.LON_MIN]; int maxLon = bounds[HaversineWrapper.LON_MAX]; int minLat = bounds[HaversineWrapper.LAT_MIN]; int maxLat = bounds[HaversineWrapper.LAT_MAX]; int lonAsInt; int latAsInt; for (int i = 0; i < maxDoc; i++) { lonAsInt = _lonLats.lons[i]; latAsInt = _lonLats.lats[i]; if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) { bits.set(i); } } return bits; }
From source file:com.browseengine.local.glue.GeoSearchFilter.java
License:Open Source License
/** * Broken at the poles. /*from ww w.j a va 2 s. c om*/ * * A more accurate representation of the result set, * computed by using actual distance measures for everything outside an * inner bounding box, but inside the outer bounding box. * The improved accuracy comes at a performance hit when compared to * {@link #makeBitSetFast(IndexReader). * * The inaccuracies * would come from an incorrect computation of the inner bounding box * (this should be improved upon if there's time--maybe just make it * a little smaller for added computation cost?). * * @param reader * @return * @throws IOException */ public BitSet makeBitSetMoreAccurate(IndexReader reader) throws IOException { int maxDoc = reader.maxDoc(); BitSet bits = new BitSet(maxDoc); Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees); // outer box int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles); int minLon = bounds[HaversineWrapper.LON_MIN]; int maxLon = bounds[HaversineWrapper.LON_MAX]; int minLat = bounds[HaversineWrapper.LAT_MIN]; int maxLat = bounds[HaversineWrapper.LAT_MAX]; // inner box approximation, test all outside inner box int lonSpread = maxLon - minLon; lonSpread = (int) (Math.round(lonSpread / SQRT_TWO) / 2); int latSpread = maxLat - minLat; latSpread = (int) (Math.round(latSpread / SQRT_TWO) / 2); int lonAsInt = GeoSearchFields.dubToInt(_lonDegrees); int latAsInt = GeoSearchFields.dubToInt(_latDegrees); int iminLon = lonAsInt - lonSpread; int imaxLon = lonAsInt + lonSpread; int iminLat = latAsInt - latSpread; int imaxLat = latAsInt + latSpread; double centerLonRad = centroid.getLongitudeRad(); double centerLatRad = centroid.getLatitudeRad(); for (int i = 0; i < maxDoc; i++) { lonAsInt = _lonLats.lons[i]; latAsInt = _lonLats.lats[i]; if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) { if (lonAsInt >= iminLon && lonAsInt <= imaxLon && latAsInt >= iminLat && latAsInt <= imaxLat) { bits.set(i); } else if 
(HaversineWrapper.computeHaversineDistanceMiles(centerLonRad, centerLatRad, lonAsInt, latAsInt) <= _rangeInMiles) { bits.set(i); } } } return bits; }
From source file:com.browseengine.local.glue.GeoSearchFilter.java
License:Open Source License
/** * Broken at the poles.//from w w w .j a v a 2 s. c o m * * Otherwise, this is an accurate representation of the true result set, but runs slower than * {@link #makeBitSetMoreAccurate(IndexReader)}. It computes the actual distance for every * member in the set, and includes it iff it is within the bounds. * * @param reader * @return * @throws IOException */ public BitSet makeBitSetCompletelyAccurate(IndexReader reader) throws IOException { int maxDoc = reader.maxDoc(); BitSet bits = new BitSet(maxDoc); Locatable centroid = LonLat.getLonLatDeg(_lonDegrees, _latDegrees); // outer box only int[] bounds = HaversineWrapper.computeLonLatMinMaxAsInt(centroid, _rangeInMiles); int minLon = bounds[HaversineWrapper.LON_MIN]; int maxLon = bounds[HaversineWrapper.LON_MAX]; int minLat = bounds[HaversineWrapper.LAT_MIN]; int maxLat = bounds[HaversineWrapper.LAT_MAX]; int lonAsInt; int latAsInt; double centerLonRad = centroid.getLongitudeRad(); double centerLatRad = centroid.getLatitudeRad(); for (int i = 0; i < maxDoc; i++) { lonAsInt = _lonLats.lons[i]; latAsInt = _lonLats.lats[i]; if (lonAsInt >= minLon && lonAsInt <= maxLon && latAsInt >= minLat && latAsInt <= maxLat) { if (HaversineWrapper.computeHaversineDistanceMiles(centerLonRad, centerLatRad, lonAsInt, latAsInt) <= _rangeInMiles) { bits.set(i); } } } return bits; }
From source file:com.browseengine.local.service.geoindex.GeoResourceWriter.java
License:Open Source License
public synchronized void optimize() throws IOException, GeoIndexingException { if (_writer != null) { if (_path != null) { _writer.optimize();/* w ww .j a v a2 s. c o m*/ File path2 = new File(_path.getParentFile(), _path.getName() + ".tmp"); _writer.close(); _writer = null; if (_path.renameTo(path2)) { IndexReader reader = null; TermEnum termEnum = null; TermDocs termDocs = null; try { reader = IndexReader.open(path2); int maxDoc = reader.maxDoc(); if (maxDoc <= 0) { throw new GeoIndexingException("can't optimize an index with " + maxDoc + " docs"); } LonDocid[] lonDocids = new LonDocid[maxDoc]; String fld = GeoSearchFields.LON.getField().intern(); Term term = new Term(fld, ""); termEnum = reader.terms(term); termDocs = reader.termDocs(); while ((term = termEnum.term()) != null && term.field() == fld) { double lon = Double.parseDouble(term.text()); termDocs.seek(term); while (termDocs.next()) { int docid = termDocs.doc(); lonDocids[docid] = new LonDocid(docid, lon); } termEnum.next(); } termDocs.close(); termDocs = null; termEnum.close(); termEnum = null; Arrays.sort(lonDocids); init(_path, true); for (int i = 0; i < lonDocids.length; i++) { int docid = lonDocids[i].docid; Document doc = reader.document(docid); // all fields are stored String name = doc.get(GeoSearchFields.NAME.getField()); String description = doc.get(GeoSearchFields.DESCRIPTION.getField()); String addressStr = doc.get(GeoSearchFields.ADDRESS.getField()); String phoneStr = doc.get(GeoSearchFields.PHONE.getField()); long phoneNumber = LocalResource.NO_PHONE_NUMBER; if (phoneStr != null && phoneStr.length() > 0) { phoneNumber = Long.parseLong(phoneStr); } String lonStr = doc.get(GeoSearchFields.LON.getField()); double lon = Double.parseDouble(lonStr); String latStr = doc.get(GeoSearchFields.LAT.getField()); double lat = Double.parseDouble(latStr); LocalResource resource = new LocalResource(name, description, addressStr, phoneNumber, lon, lat); addResource(resource); } reader.close(); reader = null; 
_writer.optimize(); LOGGER.info("successfully completed optimization of index at " + _path.getAbsolutePath()); } finally { try { // erase the tmp dir recursiveDelete(path2); } finally { try { if (reader != null) { reader.close(); } } finally { try { if (termEnum != null) { termEnum.close(); } } finally { try { if (termDocs != null) { termDocs.close(); } } finally { reader = null; termDocs = null; termEnum = null; } } } } } } else { init(_path, false); throw new GeoIndexingException("trouble doing the rename from " + _path.getAbsolutePath() + " to " + path2.getAbsolutePath() + "; check permissions"); } } else { _writer.optimize(); } } else { throw new GeoIndexingException("attempt to optimize a closed " + GeoResourceWriter.class.getName()); } }
From source file:com.browseengine.local.service.geosearch.GeoSearchImpl.java
License:Open Source License
/** * lon and lat are stored in the index as 7-decimal place precision values of * degrees.//from w w w. ja v a 2s . c om * 2^32 = 4,294,967,296 values. signed gives us -2^31 to 2^31-1. * if we multiply the normal value 179.123456 by 10^6, we get 179,123,456, * which fits in this space. * if we multiply the normal value 179.1234567 by 10^7, we get 1,791,234,567, * which fits in this space. * @throws IOException * @throws GeoSearchingException */ public static int[] loadDegreeFieldIntoInt(IndexReader reader, String fld) throws IOException, GeoSearchingException { int[] vals = new int[reader.maxDoc()]; TermEnum termEnum = null; TermDocs termDocs = null; try { fld = fld.intern(); Term term = new Term(fld, ""); termEnum = reader.terms(term); termDocs = reader.termDocs(); do { term = termEnum.term(); if (null == term || term.field() != fld) { break; } termDocs.seek(term); int numAdded = 0; while (termDocs.next()) { String str = term.text(); double dub = Double.parseDouble(str); vals[termDocs.doc()] = GeoSearchFields.dubToInt(dub); numAdded++; } if (numAdded <= 0) { throw new GeoSearchingException( "data integrity problem in field " + fld + ", term " + term.text()); } } while (termEnum.next()); return vals; } catch (NumberFormatException nfe) { throw new GeoSearchingException( "data integrity problem, non-numeric field value for field " + fld + ": " + nfe, nfe); } finally { try { if (termDocs != null) { termDocs.close(); } } finally { try { if (termEnum != null) { termEnum.close(); } } finally { // } } } }