Example usage for org.apache.lucene.index IndexReader maxDoc

List of usage examples for org.apache.lucene.index IndexReader maxDoc

Introduction

On this page you can find example usage for org.apache.lucene.index IndexReader maxDoc.

Prototype

public abstract int maxDoc();

Document

Returns one greater than the largest possible document number.
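For reference, here is a minimal standalone sketch (not taken from the source files below) showing what this value means in practice: valid document numbers run from 0 to maxDoc() - 1, and maxDoc() also counts deleted documents, unlike numDocs(). It assumes a Lucene 5.x-7.x API (FSDirectory.open taking a Path, MultiFields.getLiveDocs still available) and a placeholder index path.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Bits;

public class MaxDocSketch {
    public static void main(String[] args) throws Exception {
        // "/path/to/index" is a placeholder; point it at an existing index directory.
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // maxDoc() is one greater than the largest document number ever assigned,
            // so it also counts deleted documents; numDocs() counts only live ones.
            System.out.println("maxDoc = " + reader.maxDoc() + ", numDocs = " + reader.numDocs());

            // Valid document numbers run from 0 (inclusive) to maxDoc() (exclusive),
            // but some of them may refer to deleted documents.
            Bits liveDocs = MultiFields.getLiveDocs(reader);
            for (int docID = 0; docID < reader.maxDoc(); docID++) {
                if (liveDocs != null && !liveDocs.get(docID)) {
                    continue; // deleted document
                }
                Document doc = reader.document(docID);
                // ... process doc ...
            }
        }
    }
}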

Usage

From source file:info.boytsov.lucene.DumpIndex.java

License:Open Source License

public static void main(String[] args) {
    if (args.length < 3 || args.length > 8) {
        printUsage();
        System.exit(1);
    }
    boolean sortByURL = Integer.parseInt(args[0]) != 0;

    String srcDirName = args[1];
    String dstFileName = args[2];

    int minTermFreq = MIN_TERM_FREQ;

    if (args.length >= 4)
        minTermFreq = Integer.parseInt(args[3]);

    int maxTermQty = MAX_TERM_QTY;

    if (args.length >= 5)
        maxTermQty = Integer.parseInt(args[4]);

    System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
    System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();
        int sortTable[] = new int[docQty];

        Arrays.fill(sortTable, -1);

        if (sortByURL) {
            System.out.println("Re-sorting documents by URL!");

            URL2DocID remap[] = new URL2DocID[docQty];

            for (int docID = 0; docID < docQty; ++docID) {
                Document doc = reader.document(docID);
                String url = doc.get("url");
                remap[docID] = new URL2DocID(url, docID);
                if (docID % 100000 == 0) {
                    System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                }
            }

            Arrays.sort(remap);

            System.out.println("Collected and sorted all URLs for resoring, " + "filling out the sort table.");

            for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                sortTable[remap[newDocID].docID] = newDocID;
                //System.out.println(remap[newDocID].url);
            }

            System.out.println("Sort table is filled up!");

            for (int i = 0; i < docQty; ++i)
                remap[i] = null;
            remap = null;
            System.gc(); // Let's try to free some memory

            /*
             *  Paranoid check: did we change all the -1 values to non-negative numbers?
             *  It turned out this wasn't that paranoid: URLs may repeat, in which case
             *  some elements of sortTable remain unset.
             */
            for (int i = 0; i < sortTable.length; ++i) {
                if (sortTable[i] == -1) {
                    throw new Exception("Bug: element " + i + " in sort table is not set");
                }
            }
        } else {
            System.out.println("Keeping the original document order!");

            for (int i = 0; i < sortTable.length; ++i) {
                sortTable[i] = i; // Identity transformation
            }
        }

        FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

        File dstFile = new File(dstFileName);

        FileOutputStream outData = new FileOutputStream(dstFile);

        Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

        long totalWritten = 0;
        long totalInts = 0;

        int termId = 0;

        int batchWriteSize = 1024 * 1024 * 16;

        /*
         *  We are trying to re-use as many objects as possible,
         *  in order to reduce the number of allocations.
         */
        IntArray bufferArray = new IntArray(batchWriteSize);
        int tmpDocId[] = null;

        ByteBuffer buffer = null;

        while (iter.hasNext()) {
            Entry<TermDesc, Integer> e = iter.next();

            TermDesc ts = e.getKey();
            DocsEnum docIter = dict.getDocIterator(ts.text);

            int postQty = ts.freq;

            int qty = 0, prevDocID = -1;

            /*
             * If posting lists appear in order of descending term frequency,
             * this will actually require only one allocation.
             */
            if (tmpDocId == null || tmpDocId.length < postQty)
                tmpDocId = new int[postQty];

            bufferArray.add(postQty);

            for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                if (i >= postQty) {
                    throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                }
                int currDocID = docIter.docID();
                if (currDocID >= docQty) {
                    throw new Exception("Bug: a document ID " + currDocID
                            + " is out of bounds, total # of docs: " + docQty);
                }
                tmpDocId[i] = sortTable[currDocID];
                if (prevDocID >= docIter.docID()) {
                    throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                }
                prevDocID = currDocID;
            }
            if (qty != postQty) {
                throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
            }
            /*
             *  Now let's re-sort the docIds and write them.
             *  REMEMBER that tmpDocId is a buffer that may contain
             *  MORE than postQty elements; some of them won't be used.
             */
            Arrays.sort(tmpDocId, 0, postQty);

            for (int i = 0; i < postQty; ++i)
                bufferArray.add(tmpDocId[i]);

            totalWritten += 4 * (1 + postQty);
            totalInts += postQty;

            if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty + " overall written: "
                        + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6
                        + " Millions postings");
            }

            if (bufferArray.size() >= batchWriteSize) {
                // WriteArray may produce a new buffer, let's reuse it
                buffer = WriteArray(bufferArray, outData, buffer);
            }

            ++termId;
        }
        System.out.println("Term qty: " + termId + " flat size size : "
                + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6 + " Millions postings");

        // WriteArray may produce a new buffer, let's reuse it      
        buffer = WriteArray(bufferArray, outData, buffer);
    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:info.boytsov.lucene.GetTotPostQty.java

License:Open Source License

public static void main(String[] args) {
    if (args.length != 1) {
        printUsage();
        System.exit(1);
    }
    String srcDirName = args[0];

    System.out.println("Source dir: " + srcDirName);

    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

        int docQty = reader.maxDoc();

        Fields fields = MultiFields.getFields(reader);
        Terms terms = fields.terms(FIELD_NAME);

        long totalInts = 0;
        int termQty = 0;

        for (TermsEnum termIter = terms.iterator(null); termIter.next() != null;) {
            totalInts += termIter.docFreq();
            //System.out.println(termQty + " -> " + termIter.docFreq());
            ++termQty;
            if (termQty % 1000000 == 0)
                System.out.println("Read " + termQty + " dictionary terms");
        }

        System.out.println("Term qty: " + termQty + " Doc qty: " + docQty + " postings qty: " + totalInts);

    } catch (Exception e) {
        System.err.println("Error: " + e.getMessage());
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:info.extensiblecatalog.OAIToolkit.oai.dataproviders.LuceneFacadeDataProvider.java

License:Open Source License

synchronized static public void initializeCachedFullHarvest() {
    if (cachedFullHarvestIds == null) {

        IndexReader indexReader;
        try {
            indexReader = ApplInfo.luceneSearcher.getIndexReader().clone(true);
        } catch (CorruptIndexException e1) {
            prglog.error("[PRG] " + e1);
            return;
        } catch (IOException e1) {
            prglog.error("[PRG] " + e1);
            return;
        }
        cachedFullHarvestIndexSearcher = new IndexSearcher(indexReader);

        try {
            cachedFullHarvestEarliestDate = TextUtil
                    .luceneToDate(ApplInfo.luceneSearcher.getEarliestDatestamp());
            cachedFullHarvestExpiry = ApplInfo.luceneSearcher.getLatestDatestamp();
        } catch (ParseException pe) {
            prglog.error("[PRG] " + pe);
            return;
        }

        BooleanQuery query = new BooleanQuery();

        // don't include deleted records
        query.add((Query) new TermQuery(new Term("is_deleted", "false")), Occur.MUST);

        // do we need to filter based on orgCode?
        if (ApplInfo.getOrgCodeFilter() != null) {
            query.add((Query) new TermQuery(new Term("repository_code", ApplInfo.getOrgCodeFilter())),
                    Occur.MUST);
        }

        try {
            cachedFullHarvestIds = new BitSet(indexReader.maxDoc());
            cachedFullHarvestIndexSearcher.search(query, new Collector() {
                private int docBase;

                // ignore scorer
                public void setScorer(Scorer scorer) {
                }

                // accept docs out of order (for a BitSet it doesn't matter)
                public boolean acceptsDocsOutOfOrder() {
                    return true;
                }

                public void collect(int doc) {
                    cachedFullHarvestIds.set(doc + docBase);
                }

                public void setNextReader(IndexReader reader, int docBase) {
                    this.docBase = docBase;
                }
            });

            prglog.info("[PRG] Initial Full Harvest Cache created successfully.");

        } catch (IOException e) {
            prglog.error("[PRG] " + e);
            cachedFullHarvestIds = null;
            return;
        }

    }

}

From source file:intelligentWebAlgorithms.algos.search.ranking.DocRankMatrixBuilder.java

License:Apache License

private List<Integer> getProcessedDocs(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
        if (!idxR.hasDeletions()) { // note: as soon as the index has any deletions, no documents are collected
            Document doc = idxR.document(i);
            if (eligibleForDocRank(doc.get("doctype"))) {
                docs.add(i);
            }
        }
    }
    return docs;

}
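The method above skips every document whenever the index contains any deletions. As a hedged variation (not part of the original source file), the sketch below checks each document's liveness individually via MultiFields.getLiveDocs, assuming a Lucene 4.x-7.x API; the method name is hypothetical, and eligibleForDocRank is the helper used in the listing above.

// Hedged variation: check each document's liveness individually instead of skipping
// everything when the index has deletions. Assumes org.apache.lucene.index.MultiFields
// and org.apache.lucene.util.Bits are imported (Lucene 4.x-7.x).
private List<Integer> getProcessedDocsSkippingDeleted(IndexReader idxR) throws IOException {
    List<Integer> docs = new ArrayList<Integer>();
    Bits liveDocs = MultiFields.getLiveDocs(idxR); // null when the index has no deletions
    for (int i = 0, n = idxR.maxDoc(); i < n; i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // deleted document
        }
        Document doc = idxR.document(i);
        if (eligibleForDocRank(doc.get("doctype"))) {
            docs.add(i);
        }
    }
    return docs;
}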

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * If the result is deterministic we can cache all the docids. All queries can share this
 * cache.
 */
private ScoreDoc[] buildInternalDocidsCache(SearchArgs args) throws IOException {
    String index = args.axiom_index == null ? args.index : args.axiom_index;
    Path indexPath = Paths.get(index);
    if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
        throw new IllegalArgumentException(index + " does not exist or is not a directory.");
    }
    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath));
    IndexSearcher searcher = new IndexSearcher(reader);
    if (args.searchtweets) {
        return searcher.search(new FieldValueQuery(TweetGenerator.StatusField.ID_LONG.name), reader.maxDoc(),
                BREAK_SCORE_TIES_BY_TWEETID).scoreDocs;
    }
    return searcher.search(new FieldValueQuery(LuceneDocumentGenerator.FIELD_ID), reader.maxDoc(),
            BREAK_SCORE_TIES_BY_DOCID).scoreDocs;
}

From source file:io.anserini.rerank.lib.AxiomReranker.java

License:Apache License

/**
 * Calculate the scores (weights) of each term that occurred in the reranking pool.
 * The process:
 * 1. For each query term, calculate its score for each term in the reranking pool. The score
 * is calculated as
 * <pre>
 * P(both occurs)*log{P(both occurs)/P(t1 occurs)/P(t2 occurs)}
 * + P(both not occurs)*log{P(both not occurs)/P(t1 not occurs)/P(t2 not occurs)}
 * + P(t1 occurs t2 not occurs)*log{P(t1 occurs t2 not occurs)/P(t1 occurs)/P(t2 not occurs)}
 * + P(t1 not occurs t2 occurs)*log{P(t1 not occurs t2 occurs)/P(t1 not occurs)/P(t2 occurs)}
 * </pre>
 * 2. For each query term the scores of every other term in the reranking pool are stored in a
 * PriorityQueue, only the top {@code K} are kept.
 * 3. Add the scores of the same term together and pick the top {@code M} ones.
 *
 * @param termInvertedList A Map of <term -> Set<docId>> where the Set of docIds is where the term occurs
 * @param context An instance of RerankerContext
 * @return Map<String, Double> Top terms and their weight scores in a HashMap
 */
private Map<String, Double> computeTermScore(Map<String, Set<Integer>> termInvertedList,
        RerankerContext<T> context) throws IOException {
    class ScoreComparator implements Comparator<Pair<String, Double>> {
        public int compare(Pair<String, Double> a, Pair<String, Double> b) {
            int cmp = Double.compare(b.getRight(), a.getRight());
            if (cmp == 0) {
                return a.getLeft().compareToIgnoreCase(b.getLeft());
            } else {
                return cmp;
            }
        }
    }

    // get collection statistics so that we can get idf later on.
    IndexReader reader;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
    } else {
        IndexSearcher searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }
    final long docCount = reader.numDocs() == -1 ? reader.maxDoc() : reader.numDocs();

    //calculate the Mutual Information between term with each query term
    List<String> queryTerms = context.getQueryTokens();
    Map<String, Integer> queryTermsCounts = new HashMap<>();
    for (String qt : queryTerms) {
        queryTermsCounts.put(qt, queryTermsCounts.getOrDefault(qt, 0) + 1);
    }

    Set<Integer> allDocIds = new HashSet<>();
    for (Set<Integer> s : termInvertedList.values()) {
        allDocIds.addAll(s);
    }
    int docIdsCount = allDocIds.size();

    // Each priority queue corresponds to a query term: The p-queue itself stores all terms
    // in the reranking pool and their reranking scores to the query term.
    List<PriorityQueue<Pair<String, Double>>> allTermScoresPQ = new ArrayList<>();
    for (Map.Entry<String, Integer> q : queryTermsCounts.entrySet()) {
        String queryTerm = q.getKey();
        long df = reader.docFreq(new Term(LuceneDocumentGenerator.FIELD_BODY, queryTerm));
        if (df == 0L) {
            continue;
        }
        float idf = (float) Math.log((1 + docCount) / df);
        int qtf = q.getValue();
        if (termInvertedList.containsKey(queryTerm)) {
            PriorityQueue<Pair<String, Double>> termScorePQ = new PriorityQueue<>(new ScoreComparator());
            double selfMI = computeMutualInformation(termInvertedList.get(queryTerm),
                    termInvertedList.get(queryTerm), docIdsCount);
            for (Map.Entry<String, Set<Integer>> termEntry : termInvertedList.entrySet()) {
                double score;
                if (termEntry.getKey().equals(queryTerm)) { // The mutual information to itself will always be 1
                    score = idf * qtf;
                } else {
                    double crossMI = computeMutualInformation(termInvertedList.get(queryTerm),
                            termEntry.getValue(), docIdsCount);
                    score = idf * beta * qtf * crossMI / selfMI;
                }
                termScorePQ.add(Pair.of(termEntry.getKey(), score));
            }
            allTermScoresPQ.add(termScorePQ);
        }
    }

    Map<String, Double> aggTermScores = new HashMap<>();
    for (PriorityQueue<Pair<String, Double>> termScores : allTermScoresPQ) {
        for (int i = 0; i < Math.min(termScores.size(), this.K); i++) {
            Pair<String, Double> termScore = termScores.poll();
            String term = termScore.getLeft();
            Double score = termScore.getRight();
            if (score - 0.0 > 1e-8) {
                aggTermScores.put(term, aggTermScores.getOrDefault(term, 0.0) + score);
            }
        }
    }
    PriorityQueue<Pair<String, Double>> termScoresPQ = new PriorityQueue<>(new ScoreComparator());
    for (Map.Entry<String, Double> termScore : aggTermScores.entrySet()) {
        termScoresPQ.add(Pair.of(termScore.getKey(), termScore.getValue() / queryTerms.size()));
    }
    Map<String, Double> resultTermScores = new HashMap<>();
    for (int i = 0; i < Math.min(termScoresPQ.size(), this.M); i++) {
        Pair<String, Double> termScore = termScoresPQ.poll();
        String term = termScore.getKey();
        double score = termScore.getValue();
        resultTermScores.put(term, score);
    }

    return resultTermScores;
}
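The computeMutualInformation helper called above is not shown on this page. The following is only an illustrative sketch of the probability formula quoted in the javadoc, not Anserini's actual implementation; the method names here are hypothetical.

// Illustrative sketch of the mutual-information formula from the javadoc above.
// Assumes java.util.Set and java.util.HashSet are imported.
private double computeMutualInformationSketch(Set<Integer> docIdsX, Set<Integer> docIdsY, int totalDocCount) {
    Set<Integer> both = new HashSet<>(docIdsX);
    both.retainAll(docIdsY);

    double n = totalDocCount;
    double pX = docIdsX.size() / n;          // P(t1 occurs)
    double pY = docIdsY.size() / n;          // P(t2 occurs)
    double pBoth = both.size() / n;          // P(both occur)
    double pXOnly = pX - pBoth;              // P(t1 occurs, t2 does not)
    double pYOnly = pY - pBoth;              // P(t2 occurs, t1 does not)
    double pNeither = 1.0 - pX - pY + pBoth; // P(neither occurs)

    return miTerm(pBoth, pX, pY)
            + miTerm(pNeither, 1.0 - pX, 1.0 - pY)
            + miTerm(pXOnly, pX, 1.0 - pY)
            + miTerm(pYOnly, 1.0 - pX, pY);
}

// p * log(p / (p1 * p2)), treated as 0 whenever any of the probabilities is 0.
private static double miTerm(double p, double p1, double p2) {
    if (p <= 0.0 || p1 <= 0.0 || p2 <= 0.0) {
        return 0.0;
    }
    return p * Math.log(p / (p1 * p2));
}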

From source file:io.datalayer.lucene.index.LuceneLifecycleTest.java

License:Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(ids.length, reader.maxDoc());
    assertEquals(ids.length, reader.numDocs());
    reader.close();
}

From source file:io.datalayer.lucene.read.LuceneReaderTest.java

License:Apache License

@Test
public void testReader() throws IOException {
    IndexReader reader = DirectoryReader.open(directory);
    assertEquals(keywords.length, reader.maxDoc());
    assertEquals(keywords.length, reader.numDocs());
    reader.close();
}
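The two tests above contain no deleted documents, so maxDoc() and numDocs() agree. The hypothetical test below (not from the source files above; it assumes Lucene 5.x-7.x classes such as RAMDirectory and StandardAnalyzer plus the usual JUnit imports) shows how the two counts diverge once a document is deleted but its segment has not yet been merged away.

@Test
public void testMaxDocCountsDeletedDocuments() throws IOException {
    Directory dir = new RAMDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
    try (IndexWriter writer = new IndexWriter(dir, config)) {
        for (int i = 0; i < 3; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", Integer.toString(i), Field.Store.YES));
            writer.addDocument(doc);
        }
        writer.commit();
        writer.deleteDocuments(new Term("id", "1"));
        writer.commit();
    }
    try (IndexReader reader = DirectoryReader.open(dir)) {
        assertEquals(3, reader.maxDoc());  // the deleted document is still counted
        assertEquals(2, reader.numDocs()); // only live documents are counted
    }
}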

From source file:ir.project.TFIDFMatrix.java

private void createTermMap() {
    try {
        IndexReader reader = DirectoryReader.open(this.index);

        this.termMap = new HashMap<>(); // maps each term to its row position in the matrix
        this.numDocs = reader.maxDoc();
        int count = 0;

        // Setup the termMap
        for (int i = 0; i < numDocs; i++) {

            Terms vector = reader.getTermVector(i, "text");
            if (vector == null) {
                System.err.println("Vector is null!");
                continue;
            }

            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                if (!termMap.containsKey(it.term().utf8ToString())) {
                    termMap.put(it.term().utf8ToString(), count);
                    count += 1;
                }
            }
        }

        this.numTerms = count;
        reader.close();

    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:it.doqui.index.ecmengine.business.personalization.multirepository.index.lucene.RepositoryAwareAbstractLuceneIndexerImpl.java

License:Open Source License

/**
 * Delete all index entries which do not start with the given prefix.
 *
 * @param prefix
 */
public void deleteAll(String prefix) {
    IndexReader mainReader = null;
    try {
        mainReader = getReader();
        for (int doc = 0; doc < mainReader.maxDoc(); doc++) {
            if (!mainReader.isDeleted(doc)) {
                Document document = mainReader.document(doc);
                String[] ids = document.getValues("ID");
                if ((prefix == null) || nonStartwWith(ids, prefix)) {
                    deletions.add(ids[ids.length - 1]);
                }
            }
        }

    } catch (IOException e) {
        // If anything goes wrong we try and do a roll back
        throw new LuceneIndexException("Failed to delete all entries from the index", e);
    } finally {
        if (mainReader != null) {
            try {
                mainReader.close();
            } catch (IOException e) {
                throw new LuceneIndexException("Filed to close main reader", e);
            }
        }
    }
}