List of usage examples for org.apache.lucene.index IndexReader numDocs
public abstract int numDocs();
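numDocs() returns the number of live (non-deleted) documents visible to the reader. Before the examples, here is a minimal sketch of the typical call pattern, written in the Lucene 4.x style used by the Mahout snippets below; the class name and the index path are placeholders, not part of any example on this page.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws IOException {
        // "/path/to/index" is a placeholder; point it at an existing Lucene index.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // numDocs() counts live documents only; maxDoc() also counts deleted
            // documents that have not yet been merged away.
            System.out.println("numDocs = " + reader.numDocs());
            System.out.println("maxDoc  = " + reader.maxDoc());
            System.out.println("deleted = " + (reader.maxDoc() - reader.numDocs()));
        } finally {
            reader.close();
            dir.close();
        }
    }
}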
From source file:org.apache.gaelucene.tools.LuceneIndexPushUtil.java
License:Apache License
public static void main(String[] args) throws IOException {
    for (int i = 0; i < args.length; i++) {
        if ("-app-url".equals(args[i])) {
            gaeAppURL = args[++i];
        } else if ("-auth-cookie".equals(args[i])) {
            authCookie = args[++i];
        } else if ("-src".equals(args[i])) {
            sourceDirName = args[++i];
        } else if ("-cat".equals(args[i])) {
            category = args[++i];
        } else if ("-rec-file".equals(args[i])) {
            jobRecFileName = args[++i];
        }
    }
    if (gaeAppURL == null || authCookie == null || sourceDirName == null || category == null
            || jobRecFileName == null) {
        System.err.println(USAGE);
        System.exit(-1);
    }
    File sourceDir = new File(sourceDirName);
    if (!sourceDir.exists()) {
        System.err.println("'" + sourceDir.getAbsolutePath() + "' DOES NOT EXIST!");
        System.exit(-1);
    }
    sourceDirName = sourceDir.getAbsolutePath();
    // load filenames that were uploaded successfully last time.
    HashSet<String> uploadedRec = new HashSet<String>();
    File jobRecFile = new File(jobRecFileName);
    if (jobRecFile.exists()) {
        LineNumberReader reader = new LineNumberReader(new FileReader(jobRecFile));
        for (String line = reader.readLine(); line != null;) {
            if (line.indexOf(" OK") > -1) {
                line = line.substring(0, line.indexOf(" ")).trim();
            }
            uploadedRec.add(line);
            line = reader.readLine();
        }
        reader.close();
    }
    System.out.println("[INFO ] - trying to open index under " + sourceDirName);
    IndexReader indexReader = IndexReader.open(sourceDir);
    int maxDoc = indexReader.maxDoc();
    int numDocs = indexReader.numDocs();
    long version = indexReader.getVersion();
    boolean hasDeletions = indexReader.hasDeletions();
    boolean isOptimized = indexReader.isOptimized();
    System.out.println("maxDoc:" + maxDoc);
    System.out.println("numDocs:" + numDocs);
    System.out.println("version:" + version);
    System.out.println("hasDeletions:" + hasDeletions);
    System.out.println("isOptimized:" + isOptimized);
    // record filenames that were uploaded successfully
    BufferedWriter dataWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(jobRecFile, true)));
    System.out.println("[INFO ] - trying to synchronize the index files onto gae...");
    File[] files = sourceDir.listFiles();
    for (int i = 0; i < files.length; i++) {
        File file = files[i];
        if (uploadedRec.contains(file.getName())) {
            System.out.println("[INFO ] - skip file '" + file.getName() + "'");
            continue;
        }
        try {
            commitFile(file, category, version, i);
            dataWriter.write(file.getName() + " OK\n");
        } catch (IOException ioe) {
            System.out.println("[WARN ] - failed to upload '" + file.getName() + "', because:" + ioe);
        }
    }
    dataWriter.flush();
    dataWriter.close();
    System.out.println("[INFO ] - trying to activate the index...");
    try {
        activateIndex(category, version);
    } catch (IOException ioe) {
        System.out.println("[WARN ] - failed to activate the index, because:" + ioe);
    }
}
From source file:org.apache.jackrabbit.core.query.lucene.MultiIndex.java
License:Apache License
/**
 * Returns the number of documents in this index.
 *
 * @return the number of documents in this index.
 * @throws IOException if an error occurs while reading from the index.
 */
int numDocs() throws IOException {
    if (indexNames.size() == 0) {
        return volatileIndex.getNumDocuments();
    } else {
        IndexReader reader = getIndexReader();
        try {
            return reader.numDocs();
        } finally {
            reader.close();
        }
    }
}
From source file:org.apache.jackrabbit.core.query.lucene.IndexFormatVersion.java
License:Apache License
/**
 * @return the index format version of the index used by the given
 *         index reader.
 */
public static IndexFormatVersion getVersion(IndexReader indexReader) {
    Collection<String> fields = ReaderUtil.getIndexedFields(indexReader);
    if (fields.contains(FieldNames.LOCAL_NAME) || indexReader.numDocs() == 0) {
        return IndexFormatVersion.V3;
    } else if (fields.contains(FieldNames.PROPERTIES_SET)) {
        return IndexFormatVersion.V2;
    } else {
        return IndexFormatVersion.V1;
    }
}
From source file:org.apache.mahout.utils.vectors.lucene.AbstractLuceneIterator.java
License:Apache License
public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
        double maxPercentErrorDocs, String field) {
    this.terminfo = terminfo;
    this.normPower = normPower;
    this.indexReader = indexReader;
    this.weight = weight;
    this.nextDocId = 0;
    this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
    this.field = field;
}
From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfo.java
License:Apache License
public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
    this.field = field;
    Terms t = MultiFields.getTerms(reader, field);
    TermsEnum te = t.iterator(null);
    int numDocs = reader.numDocs();
    double percent = numDocs * maxDfPercent / 100.0;
    // Should we use a linked hash map so that we know terms are in order?
    termEntries = Maps.newLinkedHashMap();
    int count = 0;
    BytesRef text;
    while ((text = te.next()) != null) {
        int df = te.docFreq();
        if (df >= minDf && df <= percent) {
            TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
            termEntries.put(entry.getTerm(), entry);
        }
    }
}
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }
    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);
    log.info("# of documents in the index {}", reader.numDocs());
    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }
    int numDocs = reader.numDocs();
    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
    log.info("Populating term infos from the index");
    /*
     * This code is like that of CachedTermInfo, with one major change, which is how it gets the document
     * frequency.
     *
     * Since we have deleted the documents outside the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if there are no deletions
    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // check whether there are no deletions (liveDocs == null) or whether the document is live
            if (liveDocs != null && !liveDocs.get(docID)) {
                // document is deleted...
                termBitset.set(docsEnum.docID());
            }
        }
        // AND the term's bitset with the cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();
        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);
    }
    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();
    int clusterSize = wpvws.size();
    for (TermEntry termEntry : termEntryMap.values()) {
        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF,
                logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }
    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();
    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
private static OpenBitSet getClusterDocBitset(IndexReader reader, Collection<String> idSet, String idField)
        throws IOException {
    int numDocs = reader.numDocs();
    OpenBitSet bitset = new OpenBitSet(numDocs);
    Set<String> idFieldSelector = null;
    if (idField != null) {
        idFieldSelector = new TreeSet<String>();
        idFieldSelector.add(idField);
    }
    for (int i = 0; i < numDocs; i++) {
        String id;
        // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
        if (idField == null) {
            id = Integer.toString(i);
        } else {
            id = reader.document(i, idFieldSelector).get(idField);
        }
        if (idSet.contains(id)) {
            bitset.set(i);
        }
    }
    log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
    return bitset;
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
private void hashDuplicatesHelper(Path index, String url) throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index });
    FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", url, doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}
From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java
License:Apache License
public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();

    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}