org.archive.tnh.FieldCachePreAllocated.java Source code

Java tutorial

Introduction

Here is the source code for org.archive.tnh.FieldCachePreAllocated.java

Source

/**
 * Copyright 2010 Internet Archive
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package org.archive.tnh;

import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.HashMap;
import java.io.IOException;
import java.io.File;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.MMapDirectory;

public class FieldCachePreAllocated implements FieldCache {
    final String fieldName;
    final String[] fieldValueByDocId;

    // For sanity checking.
    int expectedNumberOfReaders;
    Map<IndexReader, Integer> readerDocBases;

    public FieldCachePreAllocated(String fieldName, int expectedNumberOfReaders, int totalNumberOfDocuments) {
        //this.fieldName = fieldName.intern();
        this.fieldName = fieldName;
        this.expectedNumberOfReaders = expectedNumberOfReaders;

        // This is the big memory alloc as totalNumberOfDocuments can be > 250 million.
        this.fieldValueByDocId = new String[totalNumberOfDocuments];

        this.readerDocBases = Collections
                .synchronizedMap(new HashMap<IndexReader, Integer>(expectedNumberOfReaders));
    }

    public String[] getFieldCache(IndexReader reader, int docBase) throws IOException {
        if (this.readerDocBases.size() > this.expectedNumberOfReaders) {
            // TODO: Sanity check.  More IndexReaders than we expect.
        }

        synchronized (this) {
            Integer expectedDocBase = this.readerDocBases.get(reader);
            if (expectedDocBase == null) {
                // Load the terms and fill the array.
                TermDocs termDocs = reader.termDocs();
                TermEnum termEnum = reader.terms(new Term(this.fieldName));
                try {
                    do {
                        Term term = termEnum.term();
                        if (term == null || !this.fieldName.equals(term.field()))
                            break;
                        //String termval = term.text().intern();
                        String termval = term.text();
                        termDocs.seek(termEnum);
                        while (termDocs.next()) {
                            this.fieldValueByDocId[termDocs.doc() + docBase] = termval;
                        }
                    } while (termEnum.next());
                } finally {
                    termDocs.close();
                    termEnum.close();
                }

                expectedDocBase = docBase;

                this.readerDocBases.put(reader, docBase);
            }

            if (expectedDocBase != docBase) {
                // TODO: Sanity check.  The docBase has changed.
            }
        }

        return this.fieldValueByDocId;
    }

    public String getValue(IndexReader reader, int docBase, int docId) throws IOException {
        String[] cache = this.getFieldCache(reader, docBase);

        return cache[docBase + docId];
    }

    public static void main(String args[]) throws Exception {
        if (args.length == 0) {
            System.err.println("FieldCachePreAllocated: <index...>");
            System.exit(1);
        }

        java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length);
        int totalNumDocuments = 0;
        for (String arg : args) {
            try {
                IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

                totalNumDocuments += reader.numDocs();

                readers.add(reader);
            } catch (IOException ioe) {
                System.err.println("Error reading: " + arg);
            }
        }

        FieldCachePreAllocated siteCache = new FieldCachePreAllocated("site", readers.size(), totalNumDocuments);

        int docBase = 0;
        for (IndexReader reader : readers) {
            siteCache.getFieldCache(reader, docBase);

            docBase += reader.numDocs();
        }

        for (Map.Entry<IndexReader, Integer> e : siteCache.readerDocBases.entrySet()) {
            IndexReader reader = e.getKey();
            docBase = e.getValue();
            int numDocs = reader.numDocs();

            System.out.println("Index: " + reader);
            System.out.println("  numDocs: " + numDocs);
            System.out.println("  docBase: " + docBase);

            String[] sitePerDoc = siteCache.getFieldCache(reader, docBase);
            for (int i = 0; i < numDocs; i++) {
                System.out.println("  doc[" + i + "]: " + sitePerDoc[i + docBase]);
            }
        }
    }

}