Example usage for org.apache.lucene.index IndexReader numDocs

List of usage examples for org.apache.lucene.index IndexReader numDocs

Introduction

On this page you can find example usage of org.apache.lucene.index IndexReader numDocs.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
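
numDocs() counts only the live (non-deleted) documents, while maxDoc() also accounts for deleted ones. The sketch below illustrates the difference; it assumes a Lucene 4.x-style API (DirectoryReader, FSDirectory), as used by several of the examples on this page, and a placeholder index path.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws IOException {
        // Placeholder path: point this at an existing Lucene index directory.
        Directory dir = FSDirectory.open(new File("/path/to/index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // numDocs() counts live documents only; maxDoc() includes deleted ones.
            System.out.println("numDocs: " + reader.numDocs());
            System.out.println("maxDoc : " + reader.maxDoc());
            System.out.println("deleted: " + (reader.maxDoc() - reader.numDocs()));
        } finally {
            reader.close();
            dir.close();
        }
    }
}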

Usage

From source file:org.apache.gaelucene.tools.LuceneIndexPushUtil.java

License:Apache License

public static void main(String[] args) throws IOException {
    for (int i = 0; i < args.length; i++) {
        if ("-app-url".equals(args[i])) {
            gaeAppURL = args[++i];
        } else if ("-auth-cookie".equals(args[i])) {
            authCookie = args[++i];
        } else if ("-src".equals(args[i])) {
            sourceDirName = args[++i];
        } else if ("-cat".equals(args[i])) {
            category = args[++i];
        } else if ("-rec-file".equals(args[i])) {
            jobRecFileName = args[++i];
        }
    }

    if (gaeAppURL == null || authCookie == null || sourceDirName == null || category == null
            || jobRecFileName == null) {
        System.err.println(USAGE);
        System.exit(-1);
    }

    File sourceDir = new File(sourceDirName);
    if (!sourceDir.exists()) {
        System.err.println("'" + sourceDir.getAbsolutePath() + "' DOES NOT EXIST!");
        System.exit(-1);
    }
    sourceDirName = sourceDir.getAbsolutePath();

    // load filenames that have been uploaded successfully last time.
    HashSet<String> uploadedRec = new HashSet<String>();
    File jobRecFile = new File(jobRecFileName);
    if (jobRecFile.exists()) {
        LineNumberReader reader = new LineNumberReader(new FileReader(jobRecFile));
        for (String line = reader.readLine(); line != null;) {
            if (line.indexOf(" OK") > -1) {
                line = line.substring(0, line.indexOf(" ")).trim();
            }
            uploadedRec.add(line);
            line = reader.readLine();
        }
        reader.close();
    }

    System.out.println("[INFO ] - trying to open index under " + sourceDirName);
    IndexReader indexReader = IndexReader.open(sourceDir);
    int maxDoc = indexReader.maxDoc();
    int numDocs = indexReader.numDocs();
    long version = indexReader.getVersion();
    boolean hasDeletions = indexReader.hasDeletions();
    boolean isOptimized = indexReader.isOptimized();

    System.out.println("maxDoc:" + maxDoc);
    System.out.println("numDocs:" + numDocs);
    System.out.println("version:" + version);
    System.out.println("hasDeletions:" + hasDeletions);
    System.out.println("isOptimized:" + isOptimized);

    // record filenames that were uploaded successfully
    BufferedWriter dataWriter = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(jobRecFile, true)));
    System.out.println("[INFO ] - trying to synchronize the index files onto gae...");
    File[] files = sourceDir.listFiles();
    for (int i = 0; i < files.length; i++) {
        File file = files[i];
        if (uploadedRec.contains(file.getName())) {
            System.out.println("[INFO ] - skip file '" + file.getName() + "'");
            continue;
        }
        try {
            commitFile(file, category, version, i);
            dataWriter.write(file.getName() + " OK\n");
        } catch (IOException ioe) {
            System.out.println("[WARN ] - failed to upload '" + file.getName() + "', because:" + ioe);
        }
    }
    dataWriter.flush();
    dataWriter.close();

    System.out.println("[INFO ] - trying to activate the index...");
    try {
        activateIndex(category, version);
    } catch (IOException ioe) {
        System.out.println("[WARN ] - failed to activate the index, because:" + ioe);
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.MultiIndex.java

License:Apache License

/**
 * Returns the number of documents in this index.
 *
 * @return the number of documents in this index.
 * @throws IOException if an error occurs while reading from the index.
 */
int numDocs() throws IOException {
    if (indexNames.size() == 0) {
        return volatileIndex.getNumDocuments();
    } else {
        IndexReader reader = getIndexReader();
        try {
            return reader.numDocs();
        } finally {
            reader.close();
        }
    }
}

From source file:org.apache.jackrabbit.core.query.lucene.IndexFormatVersion.java

License:Apache License

/**
 * @return the index format version of the index used by the given
 * index reader.
 */
public static IndexFormatVersion getVersion(IndexReader indexReader) {
    Collection<String> fields = ReaderUtil.getIndexedFields(indexReader);
    if (fields.contains(FieldNames.LOCAL_NAME) || indexReader.numDocs() == 0) {
        return IndexFormatVersion.V3;
    } else if (fields.contains(FieldNames.PROPERTIES_SET)) {
        return IndexFormatVersion.V2;
    } else {
        return IndexFormatVersion.V1;
    }
}

From source file:org.apache.mahout.utils.vectors.lucene.AbstractLuceneIterator.java

License:Apache License

public AbstractLuceneIterator(TermInfo terminfo, double normPower, IndexReader indexReader, Weight weight,
        double maxPercentErrorDocs, String field) {
    this.terminfo = terminfo;
    this.normPower = normPower;
    this.indexReader = indexReader;

    this.weight = weight;
    this.nextDocId = 0;
    this.maxErrorDocs = (int) (maxPercentErrorDocs * indexReader.numDocs());
    this.field = field;
}

From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfo.java

License:Apache License

public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
    this.field = field;
    Terms t = MultiFields.getTerms(reader, field);
    TermsEnum te = t.iterator(null);

    int numDocs = reader.numDocs();
    double percent = numDocs * maxDfPercent / 100.0;
    //Should we use a linked hash map so that we know terms are in order?
    termEntries = Maps.newLinkedHashMap();
    int count = 0;
    BytesRef text;
    while ((text = te.next()) != null) {
        int df = te.docFreq();
        if (df >= minDf && df <= percent) {
            TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
            termEntries.put(entry.getTerm(), entry);
        }
    }
}

From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

License:Apache License

/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {

    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }

    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);

    log.info("# of documents in the index {}", reader.numDocs());

    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }

    int numDocs = reader.numDocs();

    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);

    log.info("Populating term infos from the index");

    /**
     * This code is similar to that of CachedTermInfo, with one major change: how the document frequency is obtained.
     * 
     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); //WARNING: returns null if there are no deletions

    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // skip deleted documents; only live documents should contribute to the in-cluster frequency
            if (liveDocs == null || liveDocs.get(docID)) {
                // document is live (or the index has no deletions)
                termBitset.set(docID);
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();

        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);

    }

    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();

    int clusterSize = wpvws.size();

    for (TermEntry termEntry : termEntryMap.values()) {

        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster = new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF,
                logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }

    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();

    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}

From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java

License:Apache License

private static OpenBitSet getClusterDocBitset(IndexReader reader, Collection<String> idSet, String idField)
        throws IOException {
    int numDocs = reader.numDocs();

    OpenBitSet bitset = new OpenBitSet(numDocs);

    Set<String> idFieldSelector = null;
    if (idField != null) {
        idFieldSelector = new TreeSet<String>();
        idFieldSelector.add(idField);
    }

    for (int i = 0; i < numDocs; i++) {
        String id;
        // Use Lucene's internal ID if idField is not specified. Else, get it from the document.
        if (idField == null) {
            id = Integer.toString(i);
        } else {
            id = reader.document(i, idFieldSelector).get(idField);
        }
        if (idSet.contains(id)) {
            bitset.set(i);
        }
    }
    log.info("Created bitset for in-cluster documents : {}", bitset.cardinality());
    return bitset;
}

From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java

License:Apache License

private void hashDuplicatesHelper(Path index, String url) throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index });
    FsDirectory dir = new FsDirectory(fs, new Path(index, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", url, doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
}

From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java

License:Apache License

public void testUrlDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}

From source file:org.apache.nutch.indexer.TestDeleteDuplicates.java

License:Apache License

public void testMixedDuplicates() throws Exception {
    DeleteDuplicates dedup = new DeleteDuplicates(conf);
    dedup.dedup(new Path[] { index1, index2 });
    FsDirectory dir = new FsDirectory(fs, new Path(index1, "part-0000"), false, conf);
    IndexReader reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check url", "http://www.example.com/2", doc.get("url"));
        System.out.println(doc);
    }
    reader.close();
    dir = new FsDirectory(fs, new Path(index2, "part-0000"), false, conf);
    reader = IndexReader.open(dir);
    assertEquals("only one doc left", reader.numDocs(), 1);
    MD5Hash hash = MD5Hash.digest("2");
    for (int i = 0; i < reader.maxDoc(); i++) {
        if (reader.isDeleted(i)) {
            System.out.println("-doc " + i + " deleted");
            continue;
        }
        Document doc = reader.document(i);
        // make sure we got the right one
        assertEquals("check hash", hash.toString(), doc.get("digest"));
        System.out.println(doc);
    }
    reader.close();
}