Usage examples for org.apache.lucene.index.IndexReader.maxDoc()
public abstract int maxDoc();
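Before the project examples below, here is a minimal sketch of typical maxDoc() usage, assuming Lucene 5+ where FSDirectory.open takes a java.nio.file.Path (the examples below use both the older File-based and the newer Path-based APIs); the index path is a placeholder:

import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        // Open a reader over an existing index; the path is a placeholder.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")));
        try {
            // maxDoc() is one greater than the largest possible document number and
            // includes deleted documents; numDocs() counts only live documents.
            System.out.println("maxDoc = " + reader.maxDoc() + ", numDocs = " + reader.numDocs());

            // maxDoc() bounds the valid document-id range [0, maxDoc()).
            for (int docId = 0; docId < reader.maxDoc(); docId++) {
                Document doc = reader.document(docId); // stored fields; deleted ids are still in this range
            }
        } finally {
            reader.close();
        }
    }
}

As the examples below show, maxDoc() is most often used either as a loop bound over document ids or to size per-document data structures such as arrays and bitsets.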
From source file:info.boytsov.lucene.DumpIndex.java
License:Open Source License
public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;
    String srcDirName = args[1];
    String dstFileName = args[2];

    int minTermFreq = MIN_TERM_FREQ;
    if (args.length >= 4) minTermFreq = Integer.parseInt(args[3]);

    int maxTermQty = MAX_TERM_QTY;
    if (args.length >= 5) maxTermQty = Integer.parseInt(args[4]);

    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];
        Arrays.fill(sortTable, -1);

        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");
            URL2DocID remap[] = new URL2DocID[docQty];
            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }
            Arrays.sort(remap);
            System.out.println("Collected and sorted all URLs for re-sorting, filling out the sort table.");
            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }
            System.out.println("Sort table is filled up!");

            for (int i = 0; i < docQty; ++i) remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory

            /*
             * Paranoid check: did we change all the -1 values to non-negative numbers?
             * It turned out not to be that paranoid: you may have repeating URLs,
             * in which case some elements of sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");
            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }

        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

        File dstFile = new File(dstFileName);
        FileOutputStream outData = new FileOutputStream(dstFile);

        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

        long totalWritten = 0;
        long totalInts = 0;
        int termId = 0;

        int batchWriteSize = 1024 * 1024 * 16;

        /*
         * We are trying to re-use as many objects as possible,
         * in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;
        ByteBuffer buffer = null;

        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();
            TermDesc ts = e.getKey();

            DocsEnum docIter = dict.getDocIterator(ts.text);

            int postQty = ts.freq;
            int qty = 0, prevDocID = -1;

            /*
             * If posting lists appear in the order of descending term frequencies,
             * this will actually be only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty) tmpDocId = new int[postQty];

            bufferArray.add(postQty);

            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }

            /*
             * Now let's re-sort doc IDs and write them.
             * REMEMBER that tmpDocId is a buffer that may contain
             * MORE than postQty elements; some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);

            for (int i = 0; i < postQty; ++i) bufferArray.add(tmpDocId[i]);

            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;

            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty
                        + " overall written: " + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                        + totalInts / 1e6 + " million postings");
            }
            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }
            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size: "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, "
                + totalInts / 1e6 + " million postings");
        // WriteArray may produce a new buffer, let's reuse it
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:info.boytsov.lucene.GetTotPostQty.java
License:Open Source License
public static void main(String[] args) {
    if (args.length != 1) {
        printUsage();
        System.exit(1);
    }
    String srcDirName = args[0];
    System.out.println("Source dir: " + srcDirName);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();

        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(FIELD_NAME);

        long totalInts = 0;
        int termQty = 0;

        for (TermsEnum termIter = terms.iterator(null); termIter.next() != null;) {
            totalInts += termIter.docFreq();
            //System.out.println(termQty + " -> " + termIter.docFreq());
            ++termQty;
            if (termQty % 1000000 == 0) System.out.println("Read " + termQty + " dictionary terms");
        }

        System.out.println("Term qty: " + termQty + " Doc qty: " + docQty + " postings qty: " + totalInts);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:info.extensiblecatalog.OAIToolkit.oai.dataproviders.LuceneFacadeDataProvider.java
License:Open Source License
synchronized static public void initializeCachedFullHarvest() {
    if (cachedFullHarvestIds == null) {
        IndexReader indexReader;
        try {
            indexReader = ApplInfo.luceneSearcher.getIndexReader().clone(true);
        } catch (CorruptIndexException e1) {
            prglog.error("[PRG] " + e1);
            return;
        } catch (IOException e1) {
            prglog.error("[PRG] " + e1);
            return;
        }
        cachedFullHarvestIndexSearcher = new IndexSearcher(indexReader);
        try {
            cachedFullHarvestEarliestDate = TextUtil.luceneToDate(ApplInfo.luceneSearcher.getEarliestDatestamp());
            cachedFullHarvestExpiry = ApplInfo.luceneSearcher.getLatestDatestamp();
        } catch (ParseException pe) {
            prglog.error("[PRG] " + pe);
            return;
        }

        BooleanQuery query = new BooleanQuery();
        // don't include deleted records
        query.add((Query) new TermQuery(new Term("is_deleted", "false")), Occur.MUST);
        // do we need to filter based on orgCode?
        if (ApplInfo.getOrgCodeFilter() != null) {
            query.add((Query) new TermQuery(new Term("repository_code", ApplInfo.getOrgCodeFilter())), Occur.MUST);
        }

        try {
            cachedFullHarvestIds = new BitSet(indexReader.maxDoc());
            cachedFullHarvestIndexSearcher.search(query, new Collector() {
                private int docBase;

                // ignore scorer
                public void setScorer(Scorer scorer) {
                }

                // accept docs out of order (for a BitSet it doesn't matter)
                public boolean acceptsDocsOutOfOrder() {
                    return true;
                }

                public void collect(int doc) {
                    cachedFullHarvestIds.set(doc + docBase);
                }

                public void setNextReader(IndexReader reader, int docBase) {
                    this.docBase = docBase;
                }
            });
            prglog.info("[PRG] Initial Full Harvest Cache created successfully.");
        } catch (IOException e) {
            prglog.error("[PRG] " + e);
            cachedFullHarvestIds = null;
            return;
        }
    }
}
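The anonymous Collector above uses the pre-Lucene-5 API (setNextReader(IndexReader, int), acceptsDocsOutOfOrder()). For comparison, here is a hedged sketch of the same pattern, a BitSet sized by maxDoc() and filled per segment, written against the Lucene 5-7 SimpleCollector API; the class and method names are illustrative, not taken from the project above.

import java.io.IOException;
import java.util.BitSet;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SimpleCollector;

public class MatchingDocIdCollector {
    /** Collects the ids of all documents matching the query into a BitSet sized by maxDoc(). */
    public static BitSet collectMatchingDocIds(IndexSearcher searcher, Query query) throws IOException {
        IndexReader reader = searcher.getIndexReader();
        final BitSet hits = new BitSet(reader.maxDoc()); // maxDoc() bounds the index-wide doc-id space
        searcher.search(query, new SimpleCollector() {
            private int docBase;

            @Override
            protected void doSetNextReader(LeafReaderContext context) {
                // Per-segment doc ids are rebased to index-wide ids via the segment's docBase.
                this.docBase = context.docBase;
            }

            @Override
            public void collect(int doc) {
                hits.set(docBase + doc);
            }

            @Override
            public boolean needsScores() {
                return false; // Lucene 5-7; Lucene 8+ uses scoreMode() instead
            }
        });
        return hits;
    }
}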
From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java
License:Apache License
private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
        if (idxR.hasDeletions() == false) {
            Document doc = idxR.document(i);
            if (eligibleForDocRank(doc.get("doctype"))) {
                docs.add(i);
            }
        }
    }
    return docs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * If the result is deterministic we can cache all the docids. All queries can share this cache.
 */
private ScoreDoc[] buildInternalDocidsCache(SearchArgs args) throws IOException {
    String index = args.axiom_index == null ? args.index : args.axiom_index;
    Path indexPath = Paths.get(index);
    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(index + " does not exist or is not a directory.");
    }
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
    IndexSearcher searcher = new IndexSearcher(reader);
    if (args.searchtweets) {
        return searcher.search(new FieldValueQuery(TweetGenerator.StatusField.ID_LONG.name), reader.maxDoc(),
                BREAK_SCORE_TIES_BY_TWEETID).scoreDocs;
    }
    return searcher.search(new FieldValueQuery(LuceneDocumentGenerator.FIELD_ID), reader.maxDoc(),
            BREAK_SCORE_TIES_BY_DOCID).scoreDocs;
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 *    is calculated as
 * <pre>
 *   P(both occur)*log{P(both occur)/P(t1 occurs)/P(t2 occurs)}
 * + P(both do not occur)*log{P(both do not occur)/P(t1 does not occur)/P(t2 does not occur)}
 * + P(t1 occurs, t2 does not occur)*log{P(t1 occurs, t2 does not occur)/P(t1 occurs)/P(t2 does not occur)}
 * + P(t1 does not occur, t2 occurs)*log{P(t1 does not occur, t2 occurs)/P(t1 does not occur)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 *    PriorityQueue; only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of term -> Set<docId> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    // calculate the Mutual Information between each term and each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }
    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: the p-queue itself stores all terms
    // in the reranking pool and their reranking scores with respect to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }
    return resultTermScores;
}
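The helper computeMutualInformation is not shown in the snippet above. As a rough illustration of the formula described in the javadoc, a sketch of a pairwise mutual-information computation over the reranking pool might look like the following; this is an assumption about the helper's behavior for explanatory purposes, not Anserini's actual implementation.

import java.util.Set;

public class MutualInformationSketch {
    /**
     * Mutual information of two binary "term occurs in document" events, estimated from
     * the document sets of the two terms over a pool of totalDocs documents.
     */
    public static double computeMutualInformation(Set<Integer> docsWithT1, Set<Integer> docsWithT2, int totalDocs) {
        long both = docsWithT1.stream().filter(docsWithT2::contains).count();
        long onlyT1 = docsWithT1.size() - both;
        long onlyT2 = docsWithT2.size() - both;
        long neither = totalDocs - both - onlyT1 - onlyT2;

        double pT1 = (double) docsWithT1.size() / totalDocs;
        double pT2 = (double) docsWithT2.size() / totalDocs;

        return miTerm(both, totalDocs, pT1, pT2)
                + miTerm(neither, totalDocs, 1 - pT1, 1 - pT2)
                + miTerm(onlyT1, totalDocs, pT1, 1 - pT2)
                + miTerm(onlyT2, totalDocs, 1 - pT1, pT2);
    }

    // One summand: P(joint) * log(P(joint) / (P(a) * P(b))), taken as 0 when any probability is 0.
    private static double miTerm(long jointCount, int totalDocs, double pA, double pB) {
        double pJoint = (double) jointCount / totalDocs;
        if (pJoint == 0.0 || pA == 0.0 || pB == 0.0) {
            return 0.0;
        }
        return pJoint * Math.log(pJoint / (pA * pB));
    }
}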
From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}
From source file:io.datalayer.lucene.read.LuceneReaderTest.java
License:Apache License
@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}
From source file:ir.project.TFIDFMatrix.java
private void createTermMap() {
    try {
        IndexReader reader = DirectoryReader.open(this.index);
        this.termMap = new HashMap<>(); // Map used to identify position in matrix
        this.numDocs = reader.maxDoc();
        int count = 0;

        // Set up the termMap
        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");
            if (vector == null) {
                System.err.println("Vector is null!");
                continue;
            }
            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());
                if (!termMap.containsKey(it.term().utf8ToString())) {
                    termMap.put(it.term().utf8ToString(), count);
                    count += 1;
                }
            }
        }
        this.numTerms = count;
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java
License:Open Source License
/**
 * Delete all index entries which do not start with the given prefix.
 *
 * @param prefix
 */
public void deleteAll(String prefix) {
    IndexReader mainReader = null;
    try {
        mainReader = getReader();
        for (int doc = 0; doc < mainReader.maxDoc(); doc++) {
            if (!mainReader.isDeleted(doc)) {
                Document document = mainReader.document(doc);
                String[] ids = document.getValues("ID");
                if ((prefix == null) || nonStartwWith(ids, prefix)) {
                    deletions.add(ids[ids.length - 1]);
                }
            }
        }
    } catch (IOException e) {
        // If anything goes wrong we try and do a roll back
        throw new LuceneIndexException("Failed to delete all entries from the index", e);
    } finally {
        if (mainReader != null) {
            try {
                mainReader.close();
            } catch (IOException e) {
                throw new LuceneIndexException("Failed to close main reader", e);
            }
        }
    }
}