org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer.java Source code

Introduction

Here is the source code for org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer.java, the single-pass MapReduce indexer from the Terrier information retrieval platform.

Source

/*
 * Terrier - Terabyte Retriever 
 * Webpage: http://terrier.org 
 * Contact: terrier{a.}dcs.gla.ac.uk
 * University of Glasgow - School of Computing Science
 * http://www.gla.ac.uk/
 * 
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.1 (the "License"); you may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 *
 * The Original Code is Hadoop_BasicSinglePassIndexer.java.
 *
 * The Original Code is Copyright (C) 2004-2014 the University of Glasgow.
 * All Rights Reserved.
 *
 * Contributor(s):
 *  Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
 *  Richard McCreadie <richardm{a.}dcs.gla.ac.uk>
 */

package org.terrier.indexing.hadoop;

import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TaskAttemptID;

import org.terrier.compression.BitIn;
import org.terrier.compression.BitOutputStream;
import org.terrier.indexing.BasicSinglePassIndexer;
import org.terrier.indexing.Document;
import org.terrier.structures.BasicLexiconEntry;
import org.terrier.structures.DocumentIndexEntry;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.FieldDocumentIndexEntry;
import org.terrier.structures.FieldLexiconEntry;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.SimpleDocumentIndexEntry;
import org.terrier.structures.indexing.CompressingMetaIndexBuilder;
import org.terrier.structures.indexing.DocumentIndexBuilder;
import org.terrier.structures.indexing.MetaIndexBuilder;
import org.terrier.structures.indexing.singlepass.FieldPostingInRun;
import org.terrier.structures.indexing.singlepass.RunsMerger;
import org.terrier.structures.indexing.singlepass.SimplePostingInRun;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunIteratorFactory;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter;
import org.terrier.structures.indexing.singlepass.hadoop.HadoopRunsMerger;
import org.terrier.structures.indexing.singlepass.hadoop.IDComparator;
import org.terrier.structures.indexing.singlepass.hadoop.MapData;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.SplitAwareWrapper;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;
import org.terrier.utility.ApplicationSetup;
import org.terrier.utility.ArrayUtils;
import org.terrier.utility.FieldScore;
import org.terrier.utility.Files;
import org.terrier.utility.TerrierTimer;
import org.terrier.utility.io.HadoopPlugin;
import org.terrier.utility.io.HadoopUtility;
import org.terrier.utility.io.WrappedIOException;
import org.terrier.utility.io.HadoopPlugin.JobFactory;

/**
 * Single Pass MapReduce indexer. 
 * <p><h3>Map phase processing</h3>
 * Indexes as a Map task, taking in a series of documents, emitting posting lists for terms as
 * memory becomes exhausted. Two side-files are created for each map task: the first (the run file) records how many documents were indexed
 * in each flush of each map; the second contains the statistics for each document in a miniature document index.
 * </p>
 * <p><h3>Reduce phase processing</h3>
 * All posting lists for each term are read in, one term at a time. Using the run files, the posting lists are output into the final inverted
 * file, with all document ids corrected. Lastly, when all terms have been processed, the document indexes are merged into the final document
 * index, and the lexicon hash and lexid are created.
 * </p>
 * <p><h3>Partitioned Reduce processing</h3>
 * Normally, the MapReduce indexer is used with a single reducer. However, if the partitioner is used, multiple reducers can run concurrently,
 * building several final indices. In doing so, a large collection can be indexed into several output indices, which may be useful for distributed
 * retrieval.
 * </p>
 * @author Richard McCreadie and Craig Macdonald
 * @since 2.2
  */
@SuppressWarnings("deprecation")
public class Hadoop_BasicSinglePassIndexer extends BasicSinglePassIndexer
        implements Mapper<Text, SplitAwareWrapper<Document>, SplitEmittedTerm, MapEmittedPostingList>,
        Reducer<SplitEmittedTerm, MapEmittedPostingList, Object, Object> {
    /**
     * Main method. When invoked with "--finish numberOfReduceTasks", runs the final stage
     * of MapReduce indexing, building the reverse metadata for each output index.
     * @param args command-line arguments
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        if (args.length == 2 && args[0].equals("--finish")) {
            final JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
            if (jf == null)
                throw new Exception("Could not get JobFactory from HadoopPlugin");
            try {
                finish(ApplicationSetup.TERRIER_INDEX_PATH, Integer.parseInt(args[1]), jf);
            } catch (Exception e) {
                logger.error("Couldn't finish index", e);
            } finally {
                jf.close();
            }
        } else {
            System.err.println("Usage: Hadoop_BasicSinglePassIndexer [--finish numberOfReduceTasks]");
        }
    }
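
    /* Example of the finish invocation (illustrative only; the jar name and the
     * reduce task count below are placeholders):
     *
     *   hadoop jar terrier.jar \
     *       org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer --finish 4
     *
     * The number given with --finish should match the number of reduce tasks used
     * by the indexing job, so that every per-reducer index has its reverse
     * metadata built.
     */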

    /**
     * Completes an indexing job by building the reverse metadata structures for each
     * of the output indices.
     * @param destinationIndexPath path of the index (or indices) to finish
     * @param numberOfReduceTasks number of reduce tasks used by the indexing job
     * @param jf JobFactory used to run the reverse metadata job
     * @throws Exception
     */
    public static void finish(final String destinationIndexPath, int numberOfReduceTasks, final JobFactory jf)
            throws Exception {
        final String[] reverseMetaKeys = ApplicationSetup.getProperty("indexer.meta.reverse.keys", "docno")
                .split("\\s*,\\s*");
        Index.setIndexLoadingProfileAsRetrieval(false);
        if (numberOfReduceTasks == 1) {
            Index index = Index.createIndex(destinationIndexPath, ApplicationSetup.TERRIER_INDEX_PREFIX);
            if (index == null) {
                throw new IOException("No such index [" + destinationIndexPath + ","
                        + ApplicationSetup.TERRIER_INDEX_PREFIX + "]");
            }
            CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
            index.close();
            return;
        }
        //make a list of MR jobs in separate threads
        List<Thread> threads = new ArrayList<Thread>(numberOfReduceTasks);
        for (int i = 0; i < numberOfReduceTasks; i++) {
            final int id = i;
            threads.add(new Thread() {
                @Override
                public void run() {
                    try {
                        Index index = Index.createIndex(destinationIndexPath,
                                ApplicationSetup.TERRIER_INDEX_PREFIX + "-" + id);
                        CompressingMetaIndexBuilder.reverseAsMapReduceJob(index, "meta", reverseMetaKeys, jf);
                        index.close();
                    } catch (Exception e) {
                        logger.error("Problem finishing meta", e);
                        e.printStackTrace();
                    }
                }
            });
        }
        //start the threads
        for (Thread t : threads)
            t.start();
        //wait for the threads to end
        for (Thread t : threads)
            t.join();
    }
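
    /* Programmatic use of finish() follows the same pattern as main() above
     * (sketch only):
     *
     *   JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
     *   try {
     *       finish(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReduceTasks, jf);
     *   } finally {
     *       jf.close();
     *   }
     */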

    static enum Counters {
        INDEXED_DOCUMENTS, INDEXED_EMPTY_DOCUMENTS, INDEXER_FLUSHES, INDEXED_TOKENS, INDEXED_POINTERS;
    };

    /** JobConf of the current running job */
    protected JobConf jc;

    /** The split that these documents came from **/
    protected int splitnum;
    /** true until the first document of this task has been processed */
    protected boolean start;

    /**
     * Empty constructor. 
     */
    public Hadoop_BasicSinglePassIndexer() {
        super(0, 0, 0);
        numberOfDocuments = currentId = numberOfDocsSinceCheck = numberOfDocsSinceFlush = numberOfUniqueTerms = 0;
        numberOfTokens = numberOfPointers = 0;
        flushNo = 0;
        flushList = new LinkedList<Integer>();
    }

    /** Configure this indexer. Firstly, loads ApplicationSetup appropriately. 
     * Actual configuration of indexer is then handled by configureMap() or configureReduce()
     * depending on whether a Map or Reduce task is being configured.
     * @param _jc The configuration for the job
     */
    public void configure(JobConf _jc) {
        this.jc = _jc;

        //1. configure application
        try {
            HadoopUtility.loadTerrierJob(_jc);
        } catch (Exception e) {
            throw new Error("Cannot load ApplicationSetup", e);
        }

        //2. configure the indexer
        try {
            if (HadoopUtility.isMap(_jc)) {
                configureMap();
            } else {
                configureReduce();
            }
        } catch (Exception e) {
            throw new Error("Cannot configure indexer", e);
        }
    }

    /** Called when the Map or Reduce task ends, to finish up the indexer. Actual cleanup is
     * handled by closeMap() or closeReduce() depending on whether this is a Map or Reduce task.
     */
    public void close() throws IOException {
        if (HadoopUtility.isMap(jc)) {
            closeMap();
        } else {
            closeReduce();
        }
    }

    @Override
    /** The Hadoop indexer does not need to consider boundary documents. */
    protected void load_builder_boundary_documents() {
    }

    /* ==============================================================
     * Map implementation from here down
     * ==============================================================
     */

    /** output collector for the current map indexing process */
    protected OutputCollector<SplitEmittedTerm, MapEmittedPostingList> outputPostingListCollector;

    /** Current map number */
    protected String mapTaskID;
    /** How many flushes have we made */
    protected int flushNo;

    /** OutputStream for the data on the runs (run number, flushes etc) */
    protected DataOutputStream RunData;
    /** List of how many documents are in each flush we have made */
    protected LinkedList<Integer> flushList;

    protected void configureMap() throws Exception {
        super.init();
        Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
        Files.mkdir(indexDestination.toString());
        mapTaskID = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().toString();
        currentIndex = Index.createNewIndex(indexDestination.toString(), mapTaskID);
        maxMemory = Long.parseLong(ApplicationSetup.getProperty("indexing.singlepass.max.postings.memory", "0"));
        //during the reduce, we don't want to load these per-map indices into memory, as we only use
        //them as streams
        currentIndex.setIndexProperty("index.preloadIndices.disabled", "true");
        RunData = new DataOutputStream(
                Files.writeFileStream(new Path(indexDestination, mapTaskID + ".runs").toString()));
        RunData.writeUTF(mapTaskID);
        start = true;
        createMemoryPostings();
        super.emptyDocIndexEntry = new SimpleDocumentIndexEntry();
        super.docIndexBuilder = new DocumentIndexBuilder(currentIndex, "document");
        super.metaBuilder = createMetaIndexBuilder();
        emptyDocIndexEntry = (FieldScore.FIELDS_COUNT > 0) ? new FieldDocumentIndexEntry(FieldScore.FIELDS_COUNT)
                : new SimpleDocumentIndexEntry();
    }
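
    /* configureMap() leaves each map task with two kinds of side output in its
     * work output path: a small per-map index whose prefix is the map task id
     * (holding the document index and forward metadata for the documents this map
     * saw), and a "<mapTaskID>.runs" file recording how many documents were
     * indexed in each flush. Both are read back during the reduce phase. */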

    protected MetaIndexBuilder createMetaIndexBuilder() {
        final String[] forwardMetaKeys = ApplicationSetup.getProperty("indexer.meta.forward.keys", "docno")
                .split("\\s*,\\s*");
        final int[] metaKeyLengths = parseInts(
                ApplicationSetup.getProperty("indexer.meta.forward.keylens", "20").split("\\s*,\\s*"));
        //no reverse metadata during main indexing, pick up as separate job later
        return new CompressingMetaIndexBuilder(currentIndex, forwardMetaKeys, metaKeyLengths, new String[0]);
    }
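
    /* The forward metadata recorded by each map is controlled by two properties,
     * for example (illustrative values -- the defaults are docno and 20):
     *
     *   indexer.meta.forward.keys=docno,url
     *   indexer.meta.forward.keylens=20,256
     *
     * Reverse metadata lookups are not built here; they are produced later by the
     * separate job started from finish() via
     * CompressingMetaIndexBuilder.reverseAsMapReduceJob(). */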

    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "DM_GC", justification = "Forcing GC is an essential part of releasing "
            + "memory for further indexing")
    /** Causes the posting lists built up in memory to be flushed out */
    protected void forceFlush() throws IOException {
        logger.info("Map " + mapTaskID + ", flush requested, containing " + numberOfDocsSinceFlush
                + " documents, flush " + flushNo);
        if (mp == null)
            throw new IOException("Map flushed before any documents were indexed");
        mp.finish(new HadoopRunWriter(outputPostingListCollector, mapTaskID, splitnum, flushNo));
        RunData.writeInt(currentId);
        if (currentReporter != null)
            currentReporter.incrCounter(Counters.INDEXER_FLUSHES, 1);
        System.gc();
        createMemoryPostings();
        memoryCheck.reset();
        numberOfDocsSinceFlush = 0;
        currentId = 0;
        flushNo++;
    }
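
    /* Each flush corresponds to one run: the in-memory postings are written through
     * a HadoopRunWriter to the map output collector as
     * (SplitEmittedTerm, MapEmittedPostingList) pairs, tagged with this map's task
     * id, split number and flush number. The per-flush document count written to
     * RunData is what later allows the reducer to correct the document ids when the
     * runs are merged. */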

    /**
     * Map processes a single document. Stores the terms in the document along with their posting lists
     * until memory is full or all documents in this map have been processed, then writes them to
     * the output collector.
     * @param key - Wrapper for Document Number
     * @param value - Wrapper for Document Object
     * @param _outputPostingListCollector Collector for emitting terms and postings lists
     * @throws IOException
     */
    public void map(Text key, SplitAwareWrapper<Document> value,
            OutputCollector<SplitEmittedTerm, MapEmittedPostingList> _outputPostingListCollector, Reporter reporter)
            throws IOException {
        final String docno = key.toString();
        currentReporter = reporter;
        reporter.setStatus("Currently indexing " + docno);
        final Document doc = value.getObject();

        if (start) {
            splitnum = value.getSplitIndex();
            System.out.println(splitnum);
            //RunData.writeInt(splitnum);
            start = false;
        }

        this.outputPostingListCollector = _outputPostingListCollector;

        /* setup for parsing */
        createDocumentPostings();
        String term;//term we're currently processing
        numOfTokensInDocument = 0;
        //numberOfDocuments++;
        //get each term in the document
        while (!doc.endOfDocument()) {
            reporter.progress();
            if ((term = doc.getNextTerm()) != null && !term.equals("")) {
                termFields = doc.getFields();
                /* pass term into TermPipeline (stop, stem etc) */
                pipeline_first.processTerm(term);

                /* the term pipeline will eventually add the term to this object. */
            }
            if (MAX_TOKENS_IN_DOCUMENT > 0 && numOfTokensInDocument > MAX_TOKENS_IN_DOCUMENT)
                break;
        }

        //if we didn't index all tokens from the document,
        //we need to read to the end of the document.
        while (!doc.endOfDocument()) {
            doc.getNextTerm();
        }
        /* we now have all terms in the DocumentTree, so we save the document tree */
        if (termsInDocument
                .getDocumentLength() == 0) { /* this document is empty, add the minimum to the document index */
            // Nothing in the ifile
            indexEmpty(doc.getAllProperties());
        } else { /* index this document */
            try {
                indexDocument(doc.getAllProperties(), termsInDocument);
                numberOfTokens += numOfTokensInDocument;
                reporter.incrCounter(Counters.INDEXED_TOKENS, numOfTokensInDocument);
                reporter.incrCounter(Counters.INDEXED_POINTERS, termsInDocument.getNumberOfPointers());
            } catch (IOException ioe) {
                throw ioe;
            } catch (Exception e) {
                throw new WrappedIOException(e);
            }
        }
        termsInDocument.clear();
        reporter.incrCounter(Counters.INDEXED_DOCUMENTS, 1);
    }

    protected Reporter currentReporter;

    /**
     * Adds an empty document to the document and meta indices. Nothing is written
     * to the inverted index for such a document.
     */
    protected void indexEmpty(final Map<String, String> docProperties) throws IOException {
        /* add doc to documentindex, even though it's empty */
        if (IndexEmptyDocuments) {
            logger.warn("Adding empty document " + docProperties.get("docno"));
            docIndexBuilder.addEntryToBuffer(emptyDocIndexEntry);
            metaBuilder.writeDocumentEntry(docProperties);
            currentId++;
            numberOfDocuments++;
            currentReporter.incrCounter(Counters.INDEXED_EMPTY_DOCUMENTS, 1);
        }
    }

    /** Finish up the map processing. Forces a flush, then writes out the final run data */
    protected void closeMap() throws IOException {
        forceFlush();
        docIndexBuilder.finishedCollections();
        currentIndex.setIndexProperty("index.inverted.fields.count", "" + FieldScore.FIELDS_COUNT);
        if (FieldScore.FIELDS_COUNT > 0) {
            currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(),
                    "java.lang.String", "${index.inverted.fields.count}");
        } else {
            currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "",
                    "");
        }
        metaBuilder.close();
        currentIndex.flush();
        currentIndex.close();
        RunData.writeInt(-1);
        RunData.writeInt(numberOfDocuments);
        RunData.writeInt(splitnum);
        RunData.close();
        logger.info("Map " + mapTaskID + " finishing, indexed " + numberOfDocuments + " in " + (flushNo - 1)
                + " flushes");
    }
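
    /* The resulting "<mapTaskID>.runs" file therefore has the following layout:
     * a UTF-encoded map task id, one int per flush giving the number of documents
     * indexed in that flush, a -1 sentinel, the total number of documents indexed
     * by this map, and finally the split number. loadRunData() in the reduce phase
     * parses this via the MapData class. */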

    /* ==============================================================
     * Reduce implementation from here down
     * ==============================================================
     */

    /** OutputStream for the Lexicon*/
    protected LexiconOutputStream<String> lexstream;
    /** runIterator factory being used to generate RunIterators */
    protected HadoopRunIteratorFactory runIteratorF = null;
    /** records whether the reduce() has been called for the first time */
    protected boolean reduceStarted = false;

    /** whether this job is building multiple indices, one per reducer */
    protected boolean mutipleIndices = true;
    /** the id of this reduce task */
    protected int reduceId;
    /** prefixes of the per-map indices that will be merged during closeReduce() */
    protected String[] MapIndexPrefixes = null;
    /** the last Reporter seen by reduce(), used to report progress while merging */
    protected Reporter lastReporter = null;

    protected void configureReduce() throws Exception {
        super.init();
        start = true;
        //load in the current index
        final Path indexDestination = FileOutputFormat.getWorkOutputPath(jc);
        Files.mkdir(path = indexDestination.toString());
        final String indexDestinationPrefix = jc.get("indexing.hadoop.prefix", "data");
        reduceId = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
        mutipleIndices = jc.getBoolean("indexing.hadoop.multiple.indices", true);
        if (jc.getNumReduceTasks() > 1) {
            //gets the reduce number and suffixes it to the index prefix
            prefix = indexDestinationPrefix + "-" + reduceId;
        } else {
            prefix = indexDestinationPrefix;
        }

        currentIndex = Index.createNewIndex(path, prefix);

        super.merger = createtheRunMerger();
        reduceStarted = false;
    }
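
    /* With a single reduce task the final index uses the prefix taken from the
     * indexing.hadoop.prefix property (default "data"). With N reduce tasks and
     * indexing.hadoop.multiple.indices left at true, reducer i writes its own
     * index with prefix "<prefix>-i", so a job with 4 reducers produces four
     * separate indices (e.g. data-0 .. data-3). */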

    protected LinkedList<MapData> loadRunData() throws IOException {
        // Load in Run Data
        ArrayList<String> mapTaskIDs = new ArrayList<String>();
        final LinkedList<MapData> runData = new LinkedList<MapData>();
        DataInputStream runDataIn;

        final String jobId = TaskAttemptID.forName(jc.get("mapred.task.id")).getJobID().toString().replaceAll("job",
                "task");

        final FileStatus[] files = FileSystem.get(jc).listStatus(FileOutputFormat.getOutputPath(jc),
                new org.apache.hadoop.fs.PathFilter() {
                    public boolean accept(Path path) {
                        final String name = path.getName();
                        //1. is this a run file
                        if (!(name.startsWith(jobId) && name.endsWith(".runs")))
                            return false;
                        return true;
                    }
                });

        if (files == null || files.length == 0) {
            throw new IOException("No run status files found in " + FileOutputFormat.getOutputPath(jc));
        }

        final int thisPartition = TaskAttemptID.forName(jc.get("mapred.task.id")).getTaskID().getId();
        final SplitEmittedTerm.SETPartitioner partitionChecker = new SplitEmittedTerm.SETPartitioner();
        partitionChecker.configure(jc);

        MapData tempHRD;
        for (FileStatus file : files) {
            logger.info("Run data file " + file.getPath().toString() + " has length "
                    + Files.length(file.getPath().toString()));
            runDataIn = new DataInputStream(Files.openFileStream(file.getPath().toString()));
            tempHRD = new MapData(runDataIn);
            //check to see if this run file belongs to the partition handled by this reducer
            if (mutipleIndices && partitionChecker.calculatePartition(tempHRD.getSplitnum(),
                    jc.getNumReduceTasks()) != thisPartition)
                continue;

            mapTaskIDs.add(tempHRD.getMap());
            runData.add(tempHRD);
            runDataIn.close();
        }
        // Sort by splitnum
        Collections.sort(runData);
        Collections.sort(mapTaskIDs, new IDComparator(runData));
        // A list of the index shards
        MapIndexPrefixes = mapTaskIDs.toArray(new String[0]);
        return runData;
    }

    /**
     * Begins the merge of the postings: the document IDs in the postings are made
     * relative to one another using the run number, the number of documents covered
     * in each run, the flush number for that run and the number of documents flushed.
     * @param mapData - info about the runs (maps) and the flushes
     */
    public void startReduce(LinkedList<MapData> mapData) throws IOException {
        logger.info("The number of Reduce Tasks being used : " + jc.getNumReduceTasks());
        ((HadoopRunsMerger) (super.merger)).beginMerge(mapData);
        this.currentIndex.setIndexProperty("max.term.length",
                ApplicationSetup.getProperty("max.term.length", "" + 20));
        lexstream = new FSOMapFileLexiconOutputStream(this.currentIndex, "lexicon",
                (FieldScore.FIELDS_COUNT > 0 ? FieldLexiconEntry.Factory.class : BasicLexiconEntry.Factory.class));
        // Tell the merger how many reducers to merge for
        ((HadoopRunsMerger) merger).setNumReducers(mutipleIndices ? jc.getNumReduceTasks() : 1);
    }

    /** Main reduce algorithm step. Called for every term in the merged index, together with accessors
     * to the posting list information that has been written.
     * This reduce has no output.
     * @param Term the indexing term whose posting lists are being merged
     * @param postingIterator Iterator over the temporary posting lists we have for this term
     * @param output Unused output collector
     * @param reporter Used to report progress
     */
    public void reduce(SplitEmittedTerm Term, Iterator<MapEmittedPostingList> postingIterator,
            OutputCollector<Object, Object> output, Reporter reporter) throws IOException {
        //if (logger.isDebugEnabled()) logger.debug("Reduce for term "+Term.getText());
        reporter.setStatus("Reducer is merging term " + Term.getTerm());
        if (!reduceStarted) {
            final LinkedList<MapData> runData = loadRunData();
            startReduce(runData);
            reduceStarted = true;
        }
        String term = Term.getTerm().trim();
        if (term.length() == 0)
            return;
        runIteratorF.setRunPostingIterator(postingIterator);
        runIteratorF.setTerm(term);
        try {
            merger.mergeOne(lexstream);
        } catch (Exception e) {
            throw new WrappedIOException(e);
        }
        reporter.progress();
        this.lastReporter = reporter;
    }

    /** Merges the simple document indices made by each map task, creating the final document index */
    @SuppressWarnings("unchecked")
    protected void mergeDocumentIndex(Index[] src, int numdocs) throws IOException {
        logger.info("Merging document and meta indices");
        final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(currentIndex, "document");
        final MetaIndexBuilder metaBuilder = this.createMetaIndexBuilder();
        //int i_index = 0;
        int docCount = -1;
        TerrierTimer tt = new TerrierTimer("Merging document & meta indices", numdocs);
        tt.start();
        try {
            for (Index srcIndex : src) {
                final Iterator<DocumentIndexEntry> docidInput = (Iterator<DocumentIndexEntry>) srcIndex
                        .getIndexStructureInputStream("document");
                final Iterator<String[]> metaInput1 = (Iterator<String[]>) srcIndex
                        .getIndexStructureInputStream("meta");
                while (docidInput.hasNext()) {
                    docCount++;
                    docidOutput.addEntryToBuffer(docidInput.next());
                    metaBuilder.writeDocumentEntry(metaInput1.next());
                    this.lastReporter.progress();
                    tt.increment();
                }
                IndexUtil.close(docidInput);
                IndexUtil.close(metaInput1);
                //i_index++;
            }
        } finally {
            tt.finished();
        }
        metaBuilder.close();
        docidOutput.finishedCollections();
        if (FieldScore.FIELDS_COUNT > 0) {
            currentIndex.addIndexStructure("document-factory", FieldDocumentIndexEntry.Factory.class.getName(),
                    "java.lang.String", "${index.inverted.fields.count}");
        } else {
            currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "",
                    "");
        }
        logger.info("Finished merging document indices from " + src.length + " map tasks: " + docCount
                + " documents found");
    }

    /** Finishes the reduce step, by closing the lexicon and inverted file output,
     * building the lexicon hash and index, and merging the document indices created
     * by the map tasks. The output index is then finalised. */
    protected void closeReduce() throws IOException {

        if (!reduceStarted) {
            logger.warn("No terms were input, skipping reduce close");
            return;
        }
        //generate final index structures
        //1. any remaining lexicon terms
        merger.endMerge(lexstream);
        //2. the end of the inverted file
        merger.getBos().close();
        lexstream.close();

        //index updating is ONLY for 
        currentIndex.addIndexStructure("inverted", invertedIndexClass,
                "org.terrier.structures.Index,java.lang.String,org.terrier.structures.DocumentIndex,java.lang.Class",
                "index,structureName,document,"
                        + (FieldScore.FIELDS_COUNT > 0 ? fieldInvertedIndexPostingIteratorClass
                                : basicInvertedIndexPostingIteratorClass));
        currentIndex.addIndexStructureInputStream("inverted", invertedIndexInputStreamClass,
                "org.terrier.structures.Index,java.lang.String,java.util.Iterator,java.lang.Class",
                "index,structureName,lexicon-entry-inputstream,"
                        + (FieldScore.FIELDS_COUNT > 0 ? fieldInvertedIndexPostingIteratorClass
                                : basicInvertedIndexPostingIteratorClass));
        currentIndex.setIndexProperty("index.inverted.fields.count", "" + FieldScore.FIELDS_COUNT);
        currentIndex.setIndexProperty("index.inverted.fields.names", ArrayUtils.join(FieldScore.FIELD_NAMES, ","));

        //3. finalise the lexicon
        currentIndex.setIndexProperty("num.Terms", "" + lexstream.getNumberOfTermsWritten());
        currentIndex.setIndexProperty("num.Tokens", "" + lexstream.getNumberOfTokensWritten());
        currentIndex.setIndexProperty("num.Pointers", "" + lexstream.getNumberOfPointersWritten());
        if (FieldScore.FIELDS_COUNT > 0)
            currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(),
                    "java.lang.String", "${index.inverted.fields.count}");
        this.finishedInvertedIndexBuild();

        //the document indices are only merged if we are creating multiple indices
        //OR if this is the first reducer for a job creating a single index
        if (mutipleIndices || reduceId == 0) {
            //4. document index
            Index[] sourceIndices = new Index[MapIndexPrefixes.length];
            int numdocs = 0;
            for (int i = 0; i < MapIndexPrefixes.length; i++) {
                sourceIndices[i] = Index.createIndex(FileOutputFormat.getOutputPath(jc).toString(),
                        MapIndexPrefixes[i]);
                if (sourceIndices[i] == null)
                    throw new IOException(
                            "Could not load index from (" + FileOutputFormat.getOutputPath(jc).toString() + ","
                                    + MapIndexPrefixes[i] + ") because " + Index.getLastIndexLoadError());
                numdocs += sourceIndices[i].getCollectionStatistics().getNumberOfDocuments();
            }
            this.mergeDocumentIndex(sourceIndices, numdocs);

            //5. close the map phase indices
            for (Index i : sourceIndices) {
                i.close();
            }
        }
        currentIndex.flush();
    }

    /** Creates the RunsMerger and the RunIteratorFactory */
    protected RunsMerger createtheRunMerger() {
        logger.info("creating run merged with fields=" + useFieldInformation);
        runIteratorF = new HadoopRunIteratorFactory(null,
                (useFieldInformation ? FieldPostingInRun.class : SimplePostingInRun.class), super.numFields);
        HadoopRunsMerger tempRM = new HadoopRunsMerger(runIteratorF);
        try {
            tempRM.setBos(new BitOutputStream(currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
                    + currentIndex.getPrefix() + ".inverted" + BitIn.USUAL_EXTENSION));
        } catch (IOException ioe) {
            ioe.printStackTrace();
        }
        return (RunsMerger) tempRM;
    }
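
    /* The merger writes the final inverted file alongside the other index
     * structures, at currentIndex.getPath() + "/" + currentIndex.getPrefix()
     * + ".inverted" + BitIn.USUAL_EXTENSION (for a prefix of "data" this is a file
     * such as data.inverted.bf, the exact extension being whatever
     * BitIn.USUAL_EXTENSION defines). */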

}
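
Example

This class is normally wired into a MapReduce job by Terrier's own indexing driver rather than run by hand. The sketch below is illustrative only: it shows how a JobConf could name this class as both Mapper and Reducer and declare the map output types visible in the class declaration above. The driver class name ExampleIndexingDriver is invented for this example, and the pieces Terrier's real driver supplies (the collection InputFormat that produces SplitAwareWrapper<Document> values, the SplitEmittedTerm partitioner and comparators, and the job configuration that HadoopUtility.loadTerrierJob() expects to find) are deliberately omitted.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

import org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer;
import org.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
import org.terrier.structures.indexing.singlepass.hadoop.SplitEmittedTerm;

/** Illustrative driver sketch; not part of Terrier. */
public class ExampleIndexingDriver {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(ExampleIndexingDriver.class);
        conf.setJobName("terrier-singlepass-indexing-example");

        // the indexer implements both the Mapper and the Reducer interfaces
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);

        // map output types, matching the class declaration above
        conf.setMapOutputKeyClass(SplitEmittedTerm.class);
        conf.setMapOutputValueClass(MapEmittedPostingList.class);

        // one reducer -> one final index; more reducers -> one index per reducer
        conf.setNumReduceTasks(1);

        // destination for the index structures (first command-line argument)
        FileOutputFormat.setOutputPath(conf, new Path(args[0]));

        // NOTE: the input format, partitioner/comparators and Terrier job setup
        // are omitted here; without them this job will not actually run.
        JobClient.runJob(conf);
    }
}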