List of usage examples for org.apache.lucene.index IndexReader maxDoc
public abstract int maxDoc();
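maxDoc() returns one greater than the largest document number ever used in the index, so slots belonging to deleted documents are still counted (unlike numDocs()). Code that walks document numbers from 0 to maxDoc() therefore has to skip deletions, which is the pattern most of the examples below use. A minimal sketch of that pattern, assuming the Lucene 3.x-style API used by the Nutch examples; the class name and index-path argument are placeholders:

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocExample {
    // Visit every live document by iterating document numbers up to maxDoc().
    public static void main(String[] args) throws Exception {
        IndexReader reader = IndexReader.open(FSDirectory.open(new File(args[0])));
        try {
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                if (reader.isDeleted(docId)) {
                    continue; // counted by maxDoc(), but the document was deleted
                }
                Document doc = reader.document(docId);
                System.out.println(docId + " -> " + doc);
            }
        } finally {
            reader.close();
        }
    }
}

With the Lucene 4.x API used by the Solr examples further down, IndexReader no longer has isDeleted(); the same loop bound of maxDoc() applies, but deletions are checked through the live-docs bits (for example MultiFields.getLiveDocs(reader)).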
From source file:org.apache.nutch.indexer.IndexSorterArquivoWeb.java
License:Apache License
/**
 * Sort the documents by score.
 * @param reader
 * @param searcher
 * @return
 * @throws IOException
 */
//private static int[] oldToNew(IndexReader reader, Searcher searcher) throws IOException {
private static DocScore[] newToOld(IndexReader reader, Searcher searcher) throws IOException {
    int readerMax = reader.maxDoc();
    DocScore[] newToOld = new DocScore[readerMax];

    // use site, an indexed, un-tokenized field to get boost
    //byte[] boosts = reader.norms("site"); TODO MC
    /* TODO MC */
    Document docMeta;
    Pattern includes = Pattern.compile("\\|");
    String value = NutchConfiguration.create().get(INCLUDE_EXTENSIONS_KEY, "");
    String includeExtensions[] = includes.split(value);
    Hashtable<String, Boolean> validExtensions = new Hashtable<String, Boolean>();
    for (int i = 0; i < includeExtensions.length; i++) {
        validExtensions.put(includeExtensions[i], true);
        System.out.println("extension boosted " + includeExtensions[i]);
    }
    /* TODO MC */

    for (int oldDoc = 0; oldDoc < readerMax; oldDoc++) {
        float score;
        if (reader.isDeleted(oldDoc)) {
            //score = 0.0f;
            score = -1f; // TODO MC
        } else {
            //score = Similarity.decodeNorm(boosts[oldDoc]); TODO MC
            /* TODO MC */
            docMeta = searcher.doc(oldDoc);
            if (validExtensions.get(docMeta.get("subType")) == null) {
                // searched extensions will have higher scores
                score = -0.5f;
            } else {
                score = Integer.parseInt(docMeta.get("inlinks"));
                /* if (score==0) {
                     score=0.001f; // TODO MC - to not erase
                   } */
            }
            /* TODO MC */
            //System.out.println("Score for old document "+oldDoc+" is "+score+" and type "+docMeta.get("subType")); // TODO MC debug remove
        }
        DocScore docScore = new DocScore();
        docScore.doc = oldDoc;
        docScore.score = score;
        newToOld[oldDoc] = docScore;
    }

    System.out.println("Sorting " + newToOld.length + " documents.");
    Arrays.sort(newToOld);
    //HeapSorter.sort(newToOld); // TODO MC - due to the lack of space

    /* TODO MC
    int[] oldToNew = new int[readerMax];
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
        DocScore docScore = newToOld[newDoc];
        //oldToNew[docScore.oldDoc] = docScore.score > 0.0f ? newDoc : -1; // TODO MC
        oldToNew[docScore.oldDoc] = newDoc; // TODO MC
    }
    */
    /* TODO MC *
    for (int newDoc = 0; newDoc < readerMax; newDoc++) {
        DocScore docScore = newToOld[newDoc];
        System.out.println("Score for new document "+newDoc+" is "+docScore.score); // TODO MC debug remove
    }
    * TODO MC */

    //return oldToNew; TODO MC
    return newToOld; // TODO MC
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
private void hashDuplicatesHelper(Path index, String url) throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index });
    FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", url, doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();

    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file:org.apache.nutch.indexer.TestIndexSorter.java
License:Apache License
public void testSorting() throws Exception {
    IndexSorter sorter = new IndexSorter(conf);
    sorter.sort(testDir);
    // read back documents
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
    assertEquals(reader.numDocs(), NUM_DOCS);
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        Field f = doc.getField("content");
        assertNull(f);
        f = doc.getField("boost");
        float boost = Similarity.decodeNorm((byte) (NUM_DOCS - i));
        String cmp = String.valueOf(boost);
        assertEquals(cmp, f.stringValue());
    }
    reader.close();
}
From source file:org.apache.nutch.tools.SegmentMergeTool.java
License:Apache License
/** Run the tool, periodically reporting progress. */
public void run() {
    start = System.currentTimeMillis();
    stage = SegmentMergeStatus.STAGE_OPENING;
    long delta;
    LOG.info("* Opening " + allsegdirs.size() + " segments:");
    try {
        segdirs = new ArrayList();
        // open all segments
        for (int i = 0; i < allsegdirs.size(); i++) {
            File dir = (File) allsegdirs.get(i);
            SegmentReader sr = null;
            try {
                // try to autofix it if corrupted...
                sr = new SegmentReader(nfs, dir, true);
            } catch (Exception e) {
                // this segment is hosed beyond repair, don't use it
                LOG.warning("* Segment " + dir.getName() + " is corrupt beyond repair; skipping it.");
                continue;
            }
            segdirs.add(dir);
            totalRecords += sr.size;
            LOG.info(" - segment " + dir.getName() + ": " + sr.size + " records.");
            readers.put(dir.getName(), sr);
        }
        long total = totalRecords;
        LOG.info("* TOTAL " + total + " input records in " + segdirs.size() + " segments.");
        LOG.info("* Creating master index...");
        stage = SegmentMergeStatus.STAGE_MASTERIDX;
        // XXX Note that Lucene indexes don't work with NutchFileSystem for now.
        // XXX For now always assume LocalFileSystem here...
        Vector masters = new Vector();
        File fsmtIndexDir = new File(output, ".fastmerge_index");
        File masterDir = new File(fsmtIndexDir, "0");
        if (!masterDir.mkdirs()) {
            LOG.severe("Could not create a master index dir: " + masterDir);
            return;
        }
        masters.add(masterDir);
        IndexWriter iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
        iw.setUseCompoundFile(false);
        iw.setMergeFactor(INDEX_MERGE_FACTOR);
        iw.setRAMBufferSizeMB(INDEX_MIN_MERGE_DOCS);
        long s1 = System.currentTimeMillis();
        Iterator it = readers.values().iterator();
        processedRecords = 0L;
        delta = System.currentTimeMillis();
        while (it.hasNext()) {
            SegmentReader sr = (SegmentReader) it.next();
            String name = sr.segmentDir.getName();
            FetcherOutput fo = new FetcherOutput();
            for (long i = 0; i < sr.size; i++) {
                try {
                    if (!sr.get(i, fo, null, null, null))
                        break;
                    Document doc = new Document();
                    // compute boost
                    float boost = IndexSegment.calculateBoost(fo.getFetchListEntry().getPage().getScore(),
                            scorePower, boostByLinkCount, fo.getAnchors().length);
                    // doc.add(new Field("sd", name + "|" + i, true, false, false));
                    // doc.add(new Field("uh", MD5Hash.digest(fo.getUrl().toString()).toString(), true, true, false));
                    // doc.add(new Field("ch", fo.getMD5Hash().toString(), true, true, false));
                    // doc.add(new Field("time", DateField.timeToString(fo.getFetchDate()), true, false, false));
                    // doc.add(new Field("score", boost + "", true, false, false));
                    // doc.add(new Field("ul", fo.getUrl().toString().length() + "", true, false, false));
                    iw.addDocument(doc);
                    processedRecords++;
                    if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
                        LOG.info(" Processed " + processedRecords + " records ("
                                + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
                        delta = System.currentTimeMillis();
                    }
                    if (processedRecords > 0 && (processedRecords % INDEX_SIZE == 0)) {
                        iw.optimize();
                        iw.close();
                        LOG.info(" - creating next subindex...");
                        masterDir = new File(fsmtIndexDir, "" + masters.size());
                        if (!masterDir.mkdirs()) {
                            LOG.severe("Could not create a master index dir: " + masterDir);
                            return;
                        }
                        masters.add(masterDir);
                        iw = new IndexWriter(masterDir, new WhitespaceAnalyzer(), true);
                        iw.setUseCompoundFile(false);
                        iw.setMergeFactor(INDEX_MERGE_FACTOR);
                        iw.setRAMBufferSizeMB(INDEX_MIN_MERGE_DOCS);
                    }
                } catch (Throwable t) {
                    // we can assume the data is invalid from now on - break here
                    LOG.info(" - segment " + name + " truncated to " + (i + 1) + " records");
                    break;
                }
            }
        }
        iw.optimize();
        LOG.info("* Creating index took " + (System.currentTimeMillis() - s1) + " ms");
        s1 = System.currentTimeMillis();
        // merge all other indexes using the latest IndexWriter (still open):
        if (masters.size() > 1) {
            LOG.info(" - merging subindexes...");
            stage = SegmentMergeStatus.STAGE_MERGEIDX;
            IndexReader[] ireaders = new IndexReader[masters.size() - 1];
            for (int i = 0; i < masters.size() - 1; i++)
                ireaders[i] = IndexReader.open((File) masters.get(i));
            iw.addIndexes(ireaders);
            for (int i = 0; i < masters.size() - 1; i++) {
                ireaders[i].close();
                FileUtil.fullyDelete((File) masters.get(i));
            }
        }
        iw.close();
        LOG.info("* Optimizing index took " + (System.currentTimeMillis() - s1) + " ms");
        LOG.info("* Removing duplicate entries...");
        stage = SegmentMergeStatus.STAGE_DEDUP;
        IndexReader ir = IndexReader.open(masterDir);
        int i = 0;
        long cnt = 0L;
        processedRecords = 0L;
        s1 = System.currentTimeMillis();
        delta = s1;
        TermEnum te = ir.terms();
        while (te.next()) {
            Term t = te.term();
            if (t == null)
                continue;
            if (!(t.field().equals("ch") || t.field().equals("uh")))
                continue;
            cnt++;
            processedRecords = cnt / 2;
            if (cnt > 0 && (cnt % (LOG_STEP * 2) == 0)) {
                LOG.info(" Processed " + processedRecords + " records ("
                        + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
                delta = System.currentTimeMillis();
            }
            // Enumerate all docs with the same URL hash or content hash
            TermDocs td = ir.termDocs(t);
            if (td == null)
                continue;
            if (t.field().equals("uh")) {
                // Keep only the latest version of the document with
                // the same url hash. Note: even if the content
                // hash is identical, other metadata may be different, so even
                // in this case it makes sense to keep the latest version.
                int id = -1;
                String time = null;
                Document doc = null;
                while (td.next()) {
                    int docid = td.doc();
                    if (!ir.isDeleted(docid)) {
                        doc = ir.document(docid);
                        if (time == null) {
                            time = doc.get("time");
                            id = docid;
                            continue;
                        }
                        String dtime = doc.get("time");
                        // "time" is a DateField, and can be compared lexicographically
                        if (dtime.compareTo(time) > 0) {
                            if (id != -1) {
                                ir.deleteDocument(id);
                            }
                            time = dtime;
                            id = docid;
                        } else {
                            ir.deleteDocument(docid);
                        }
                    }
                }
            } else if (t.field().equals("ch")) {
                // Keep only the version of the document with
                // the highest score, and then with the shortest url.
                int id = -1;
                int ul = 0;
                float score = 0.0f;
                Document doc = null;
                while (td.next()) {
                    int docid = td.doc();
                    if (!ir.isDeleted(docid)) {
                        doc = ir.document(docid);
                        if (ul == 0) {
                            try {
                                ul = Integer.parseInt(doc.get("ul"));
                                score = Float.parseFloat(doc.get("score"));
                            } catch (Exception e) {
                            }
                            id = docid;
                            continue;
                        }
                        int dul = 0;
                        float dscore = 0.0f;
                        try {
                            dul = Integer.parseInt(doc.get("ul"));
                            dscore = Float.parseFloat(doc.get("score"));
                        } catch (Exception e) {
                        }
                        int cmp = Float.compare(dscore, score);
                        if (cmp == 0) {
                            // equal scores, select the one with shortest url
                            if (dul < ul) {
                                if (id != -1) {
                                    ir.deleteDocument(id);
                                }
                                ul = dul;
                                id = docid;
                            } else {
                                ir.deleteDocument(docid);
                            }
                        } else if (cmp < 0) {
                            ir.deleteDocument(docid);
                        } else {
                            if (id != -1) {
                                ir.deleteDocument(id);
                            }
                            ul = dul;
                            id = docid;
                        }
                    }
                }
            }
        }
        //
        // keep the IndexReader open...
        //
        LOG.info("* Deduplicating took " + (System.currentTimeMillis() - s1) + " ms");
        stage = SegmentMergeStatus.STAGE_WRITING;
        processedRecords = 0L;
        Vector outDirs = new Vector();
        File outDir = new File(output, SegmentWriter.getNewSegmentName());
        outDirs.add(outDir);
        LOG.info("* Merging all segments into " + output.getName());
        s1 = System.currentTimeMillis();
        delta = s1;
        nfs.mkdirs(outDir);
        SegmentWriter sw = new SegmentWriter(nfs, outDir, true);
        LOG.fine(" - opening first output segment in " + outDir.getName());
        FetcherOutput fo = new FetcherOutput();
        Content co = new Content();
        ParseText pt = new ParseText();
        ParseData pd = new ParseData();
        int outputCnt = 0;
        for (int n = 0; n < ir.maxDoc(); n++) {
            if (ir.isDeleted(n)) {
                //System.out.println("-del");
                continue;
            }
            Document doc = ir.document(n);
            String segDoc = doc.get("sd");
            int idx = segDoc.indexOf('|');
            String segName = segDoc.substring(0, idx);
            String docName = segDoc.substring(idx + 1);
            SegmentReader sr = (SegmentReader) readers.get(segName);
            long docid;
            try {
                docid = Long.parseLong(docName);
            } catch (Exception e) {
                continue;
            }
            try {
                // get data from the reader
                sr.get(docid, fo, co, pt, pd);
            } catch (Throwable thr) {
                // don't break the loop, because only one of the segments
                // may be corrupted...
                LOG.fine(" - corrupt record no. " + docid + " in segment " + sr.segmentDir.getName()
                        + " - skipping.");
                continue;
            }
            sw.append(fo, co, pt, pd);
            outputCnt++;
            processedRecords++;
            if (processedRecords > 0 && (processedRecords % LOG_STEP == 0)) {
                LOG.info(" Processed " + processedRecords + " records ("
                        + (float) (LOG_STEP * 1000) / (float) (System.currentTimeMillis() - delta) + " rec/s)");
                delta = System.currentTimeMillis();
            }
            if (processedRecords % maxCount == 0) {
                sw.close();
                outDir = new File(output, SegmentWriter.getNewSegmentName());
                LOG.fine(" - starting next output segment in " + outDir.getName());
                nfs.mkdirs(outDir);
                sw = new SegmentWriter(nfs, outDir, true);
                outDirs.add(outDir);
            }
        }
        LOG.info("* Merging took " + (System.currentTimeMillis() - s1) + " ms");
        ir.close();
        sw.close();
        FileUtil.fullyDelete(fsmtIndexDir);
        for (Iterator iter = readers.keySet().iterator(); iter.hasNext();) {
            SegmentReader sr = (SegmentReader) readers.get(iter.next());
            sr.close();
        }
        if (runIndexer) {
            stage = SegmentMergeStatus.STAGE_INDEXING;
            totalRecords = outDirs.size();
            processedRecords = 0L;
            LOG.info("* Creating new segment index(es)...");
            File workingDir = new File(output, "indexsegment-workingdir");
            for (int k = 0; k < outDirs.size(); k++) {
                processedRecords++;
                if (workingDir.exists()) {
                    FileUtil.fullyDelete(workingDir);
                }
                IndexSegment indexer = new IndexSegment(nfs, Integer.MAX_VALUE, (File) outDirs.get(k), workingDir);
                indexer.indexPages();
                FileUtil.fullyDelete(workingDir);
            }
        }
        if (delSegs) {
            // This deletes also all corrupt segments, which are
            // unusable anyway
            stage = SegmentMergeStatus.STAGE_DELETING;
            totalRecords = allsegdirs.size();
            processedRecords = 0L;
            LOG.info("* Deleting old segments...");
            for (int k = 0; k < allsegdirs.size(); k++) {
                processedRecords++;
                FileUtil.fullyDelete((File) allsegdirs.get(k));
            }
        }
        delta = System.currentTimeMillis() - start;
        float eps = (float) total / (float) (delta / 1000);
        LOG.info("Finished SegmentMergeTool: INPUT: " + total + " -> OUTPUT: " + outputCnt + " entries in "
                + ((float) delta / 1000f) + " s (" + eps + " entries/sec).");
    } catch (Exception e) {
        e.printStackTrace();
        LOG.severe(e.getMessage());
    }
}
From source file:org.apache.solr.codecs.test.testGetStoredFields.java
License:Apache License
public static void getDoc(String searchField, String searchString) throws IOException, ParseException {
    System.out.println("Searching for '" + searchString + "'");
    Directory luceneDir = new ONSQLWrapperDirectory(new File(INDEX_ROOT_FOLDER));
    IndexReader indexReader = DirectoryReader.open(luceneDir);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    TotalHitCountCollector hitCountCollector = new TotalHitCountCollector();
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
    QueryParser queryParser = new QueryParser(Version.LUCENE_4_10_1, searchField, analyzer);
    Query query = queryParser.parse(searchString);
    indexSearcher.search(query, hitCountCollector);
    System.out.println("Word: " + searchString + "; Number of hits: " + hitCountCollector.getTotalHits());
    System.out.println("maxdocs=" + indexReader.maxDoc());
    org.apache.lucene.search.TopDocs docs = indexSearcher.search(query, 100);
    for (int i = 0; i < docs.scoreDocs.length; i++) {
        Document doc1 = indexReader.document(docs.scoreDocs[i].doc);
        System.out.println("title=" + doc1.get("title"));
        System.out.println("content=" + doc1.get("content"));
        System.out.println("global_bu_id=" + doc1.get("global_bu_id"));
        System.out.println("omega_order_num=" + doc1.get("omega_order_num"));
        System.out.println("------");
    }
    luceneDir.close();
}
From source file:org.apache.solr.request.uninverted.UnInvertedField.java
License:Apache License
public static BitDocSet ajustBase(int times, BitDocSet baseAdvanceDocs, IndexReader reader) {
    try {
        if (baseAdvanceDocs == null) {
            return null;
        }
        int maxdoc = reader.maxDoc();
        int oversize = maxdoc / times;
        int size = baseAdvanceDocs.size();
        int maxinterval = reader.getMaxInterval();
        log.info("ajustBase " + maxinterval + ",baseAdvanceDocs=" + size + "@" + oversize + "@" + maxdoc + ","
                + reader.getClass().getCanonicalName());
        if (size >= oversize || maxinterval > 256) {
            return null;
        }
    } catch (Exception e) {
        return null;
    }
    return baseAdvanceDocs;
}
From source file:org.apache.solr.request.uninverted.UnInvertedField.java
License:Apache License
public static BitDocSet cloneBitset(DocSet baseAdvanceDocs, IndexReader reader) {
    if (baseAdvanceDocs instanceof BitDocSet) {
        BitDocSet rtn = (BitDocSet) baseAdvanceDocs;
        OpenBitSet newbits = (OpenBitSet) (rtn.getBits().clone());
        return new BitDocSet(newbits, rtn.size());
    }
    OpenBitSet bs = new OpenBitSet(reader.maxDoc());
    DocIterator iter = baseAdvanceDocs.iterator();
    int pos = 0;
    while (iter.hasNext()) {
        bs.fastSet(iter.nextDoc());
        pos++;
    }
    return new BitDocSet(bs, pos);
}
From source file:org.apache.solr.search.function.FileFloatSource.java
License:Apache License
private static float[] getFloats(FileFloatSource ffs, IndexReader reader) {
    float[] vals = new float[reader.maxDoc()];
    if (ffs.defVal != 0) {
        Arrays.fill(vals, ffs.defVal);
    }
    InputStream is;
    String fname = "external_" + ffs.field.getName();
    try {
        is = VersionedFile.getLatestFile(ffs.dataDir, fname);
    } catch (IOException e) {
        // log, use defaults
        SolrCore.log.error("Error opening external value source file: " + e);
        return vals;
    }
    BufferedReader r = new BufferedReader(new InputStreamReader(is, IOUtils.CHARSET_UTF_8));
    String idName = ffs.keyField.getName();
    FieldType idType = ffs.keyField.getType();

    // warning: lucene's termEnum.skipTo() is not optimized... it simply does a next()
    // because of this, simply ask the reader for a new termEnum rather than
    // trying to use skipTo()

    List<String> notFound = new ArrayList<String>();
    int notFoundCount = 0;
    int otherErrors = 0;

    char delimiter = '=';

    BytesRef internalKey = new BytesRef();

    try {
        TermsEnum termsEnum = MultiFields.getTerms(reader, idName).iterator(null);
        DocsEnum docsEnum = null;

        // removing deleted docs shouldn't matter
        // final Bits liveDocs = MultiFields.getLiveDocs(reader);

        for (String line; (line = r.readLine()) != null;) {
            int delimIndex = line.lastIndexOf(delimiter);
            if (delimIndex < 0)
                continue;

            int endIndex = line.length();
            String key = line.substring(0, delimIndex);
            String val = line.substring(delimIndex + 1, endIndex);

            float fval;
            try {
                idType.readableToIndexed(key, internalKey);
                fval = Float.parseFloat(val);
            } catch (Exception e) {
                if (++otherErrors <= 10) {
                    SolrCore.log.error("Error loading external value source + fileName + " + e
                            + (otherErrors < 10 ? "" : "\tSkipping future errors for this file."));
                }
                continue; // go to next line in file.. leave values as default.
            }

            if (!termsEnum.seekExact(internalKey)) {
                if (notFoundCount < 10) { // collect first 10 not found for logging
                    notFound.add(key);
                }
                notFoundCount++;
                continue;
            }

            docsEnum = termsEnum.docs(null, docsEnum, DocsEnum.FLAG_NONE);
            int doc;
            while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                vals[doc] = fval;
            }
        }
    } catch (IOException e) {
        // log, use defaults
        SolrCore.log.error("Error loading external value source: " + e);
    } finally {
        // swallow exceptions on close so we don't override any
        // exceptions that happened in the loop
        try {
            r.close();
        } catch (Exception e) {
        }
    }

    SolrCore.log.info("Loaded external value source " + fname
            + (notFoundCount == 0 ? "" : " :" + notFoundCount + " missing keys " + notFound));
    return vals;
}