info.boytsov.lucene.DumpIndex.java Source code

Introduction

Here is the source code for info.boytsov.lucene.DumpIndex.java

Source

/**
 *
 * This code is released under the
 * Apache License Version 2.0 http://www.apache.org/licenses/.
 *  
 */
package info.boytsov.lucene;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map.Entry;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;

import me.lemire.lucene.IntArray;

/**
 * 
 * A simple utility to dump posting lists from the Lucene index,
 * which was previously created by the utility CreateIndex.
 * 
 *  We process only the field "body". 
 *  For each keyword, we first store the number of postings. 
 *  Next, we store the postings (each using 4 bytes). 
 *  
 * @author Leonid Boytsov
 */
public class DumpIndex {
    final public static String FIELD_NAME = "body"; // only this field is dumped
    final public static int MIN_TERM_FREQ = 3;      // default minimum term frequency
    final public static int MAX_TERM_QTY = 10000000; // default maximum number of terms

    public static void main(String[] args) {
        if (args.length < 3 || args.length > 8) {
            printUsage();
            System.exit(1);
        }
        boolean sortByURL = Integer.parseInt(args[0]) != 0;

        String srcDirName = args[1];
        String dstFileName = args[2];

        int minTermFreq = MIN_TERM_FREQ;

        if (args.length >= 4)
            minTermFreq = Integer.parseInt(args[3]);

        int maxTermQty = MAX_TERM_QTY;

        if (args.length >= 5)
            maxTermQty = Integer.parseInt(args[4]);

        System.out.println("Source dir: " + srcDirName + " target dir: " + dstFileName);
        System.out.println("Min term freq: " + minTermFreq + " Max # of terms: " + maxTermQty);

        try {
            IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(srcDirName)));

            int docQty = reader.maxDoc();
            int sortTable[] = new int[docQty];

            Arrays.fill(sortTable, -1);

            if (sortByURL) {
                System.out.println("Re-sorting documents by URL!");

                URL2DocID remap[] = new URL2DocID[docQty];

                for (int docID = 0; docID < docQty; ++docID) {
                    Document doc = reader.document(docID);
                    String url = doc.get("url");
                    remap[docID] = new URL2DocID(url, docID);
                    if (docID % 100000 == 0) {
                        System.out.println("Collected " + (docID + 1) + " URLs for re-sorting");
                    }
                }

                Arrays.sort(remap);

                System.out.println("Collected and sorted all URLs for resoring, " + "filling out the sort table.");

                for (int newDocID = 0; newDocID < docQty; ++newDocID) {
                    sortTable[remap[newDocID].docID] = newDocID;
                    //System.out.println(remap[newDocID].url);
                }

                System.out.println("Sort table is filled up!");

                for (int i = 0; i < docQty; ++i)
                    remap[i] = null;
                remap = null;
                System.gc(); // Let's try to free some memory

                /*
                 *  Paranoid check: did we replace all the -1 entries with non-negative numbers?
                 *  It turned out this check wasn't that paranoid: URLs may repeat, and then
                 *  some elements in sortTable remain unset.
                 */
                for (int i = 0; i < sortTable.length; ++i) {
                    if (sortTable[i] == -1) {
                        throw new Exception("Bug: element " + i + " in sort table is not set");
                    }
                }
            } else {
                System.out.println("Keeping the original document order!");

                for (int i = 0; i < sortTable.length; ++i) {
                    sortTable[i] = i; // Identity transformation
                }
            }

            FreqWordDict dict = new FreqWordDict(reader, FIELD_NAME, minTermFreq, maxTermQty);

            File dstFile = new File(dstFileName);

            FileOutputStream outData = new FileOutputStream(dstFile);

            Iterator<Entry<TermDesc, Integer>> iter = dict.getTermIterator();

            long totalWritten = 0;
            long totalInts = 0;

            int termId = 0;

            int batchWriteSize = 1024 * 1024 * 16; // flush the buffer to disk once it holds ~16M integers (64 MB)

            /*
             *  We are trying to re-use as many objects as possible,
             *  in order to reduce the number of allocations.
             */
            IntArray bufferArray = new IntArray(batchWriteSize);
            int tmpDocId[] = null;

            ByteBuffer buffer = null;

            while (iter.hasNext()) {
                Entry<TermDesc, Integer> e = iter.next();

                TermDesc ts = e.getKey();
                DocsEnum docIter = dict.getDocIterator(ts.text);

                int postQty = ts.freq;

                int qty = 0, prevDocID = -1;

                /*
                 * If posting lists are processed in order of decreasing term frequency,
                 * this will actually result in only one allocation.
                 */
                if (tmpDocId == null || tmpDocId.length < postQty)
                    tmpDocId = new int[postQty];

                bufferArray.add(postQty);

                for (int i = 0; docIter.nextDoc() != DocIdSetIterator.NO_MORE_DOCS; ++i, ++qty) {
                    if (i >= postQty) {
                        throw new Exception("Bug: more postings than expected for term: " + ts.getText());
                    }
                    int currDocID = docIter.docID();
                    if (currDocID >= docQty) {
                        throw new Exception("Bug: a document ID " + currDocID
                                + " is out of bounds, total # of docs: " + docQty);
                    }
                    tmpDocId[i] = sortTable[currDocID];
                    if (prevDocID >= currDocID) {
                        throw new Exception("Bug: unsorted doc ids for term: " + ts.getText());
                    }
                    prevDocID = currDocID;
                }
                if (qty != postQty) {
                    throw new Exception("Bug: fewer postings than expected for term: " + ts.getText());
                }
                /*
                 *  Now let's re-sort the doc IDs and write them.
                 *  REMEMBER that tmpDocId is a buffer that may contain
                 *  MORE than postQty elements!!!
                 *  Some of them won't be used.
                 */
                Arrays.sort(tmpDocId, 0, postQty);

                for (int i = 0; i < postQty; ++i)
                    bufferArray.add(tmpDocId[i]);

                totalWritten += 4 * (1 + postQty);
                totalInts += postQty;

                if (termId % 100000 == 0 || bufferArray.size() >= batchWriteSize) {
                    System.out.println(termId + ":" + ts.getText() + " \t postQty=" + postQty + " overall written: "
                            + totalWritten / 1024.0 / 1024.0 / 1024.0 + " GBs, " + totalInts / 1e6
                            + " million postings");
                }

                if (bufferArray.size() >= batchWriteSize) {
                    // WriteArray may produce a new buffer, let's reuse it
                    buffer = WriteArray(bufferArray, outData, buffer);
                }

                ++termId;
            }
            System.out.println("Term qty: " + termId + " flat size size : "
                    + totalWritten / 1024.0 / 1024.0 / 1024.0 + " Gbs, " + totalInts / 1e6 + " Millions postings");

            // WriteArray may produce a new buffer, let's reuse it      
            buffer = WriteArray(bufferArray, outData, buffer);
        } catch (Exception e) {
            System.err.println("Error: " + e.getMessage());
            e.printStackTrace();
            System.exit(1);
        }

    }

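    /**
     * Writes the accumulated integers to the output stream as 4-byte
     * little-endian values. The supplied ByteBuffer is reused when it is
     * large enough; otherwise a bigger one is allocated and returned.
     * bufferArray is cleared after the write.
     */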
    private static ByteBuffer WriteArray(IntArray bufferArray, FileOutputStream outData, ByteBuffer reuseBuffer)
            throws IOException {
        int newCapacity = 4 * bufferArray.size();
        if (reuseBuffer == null || reuseBuffer.capacity() < newCapacity) {
            reuseBuffer = ByteBuffer.allocate(newCapacity);
            reuseBuffer.order(ByteOrder.LITTLE_ENDIAN);
        }

        reuseBuffer.clear();

        for (int i = 0; i < bufferArray.size(); ++i)
            reuseBuffer.putInt(bufferArray.get(i));

        outData.write(reuseBuffer.array(), 0, newCapacity);

        bufferArray.clear();

        return reuseBuffer;
    }

    private static void printUsage() {
        System.out.println("mvn exec:java " + " -Dexec.mainClass=info.boytsov.lucene.DumpIndex " + " -Dexec.args=\""
                + " <sort by URL: 1:0>" + " <index dir> <output file>"
                + " <optional: min term frequency> <optional: max # of terms>\"");
    }

}
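
A typical invocation follows the usage string printed by printUsage; the index directory and output file name below are placeholders, not paths taken from the original project:

    mvn exec:java -Dexec.mainClass=info.boytsov.lucene.DumpIndex \
        -Dexec.args="0 /path/to/lucene/index postings.bin 3 10000000"

The resulting dump is a flat sequence of little-endian 32-bit integers: for each term, the posting count followed by that many document IDs in ascending order (no term text is stored). The class below is a minimal reader sketch for this format; its name (ReadDump) and the choice to load the whole file into memory are assumptions made for illustration, not part of the original utility.

import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ReadDump {
    public static void main(String[] args) throws Exception {
        // Illustrative sketch: read the whole dump into memory; a very large
        // dump would need a streaming reader instead.
        ByteBuffer buffer = ByteBuffer.wrap(Files.readAllBytes(Paths.get(args[0])));
        buffer.order(ByteOrder.LITTLE_ENDIAN); // DumpIndex writes little-endian ints

        long termQty = 0, totalPostings = 0;

        while (buffer.remaining() >= 4) {
            int postQty = buffer.getInt();   // number of postings for the next term
            for (int i = 0; i < postQty; ++i) {
                int docId = buffer.getInt(); // doc IDs are sorted in ascending order
                // process docId here
            }
            ++termQty;
            totalPostings += postQty;
        }

        System.out.println("Terms: " + termQty + ", postings: " + totalPostings);
    }
}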