Example usage for org.apache.hadoop.mapred JobConf setPartitionerClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> theClass)

Source Link

Document

Set the Partitioner class used to partition Mapper outputs to be sent to the Reducers.

Usage

From source file:org.cloudata.examples.upload.SimpleUploaderMapReduce.java

License:Apache License

/**
 * Uploads a text input into a Cloudata table through a MapReduce job.
 *
 * <p>If the target table does not exist it is created with a single column
 * ({@code Col1}) and twenty pre-defined split keys. The job output directory is
 * a throw-away path that is removed again once the job has finished, whether
 * it succeeded or not.
 *
 * @param args {@code args[0]} input path, {@code args[1]} table name,
 *             {@code args[2]} number of reduce tasks
 * @throws IOException if the table setup or the final cleanup fails
 */
public void run(String[] args) throws IOException {
    if (args.length < 3) {
        System.out.println("Usage: java SimpleUploaderMapReduce <input path> <table name> <# reduce>");
        System.exit(0);
    }

    final Path inputPath = new Path(args[0]);
    final String tableName = args[1];

    final CloudataConf cloudataConf = new CloudataConf();
    final boolean tableMissing = !CTable.existsTable(cloudataConf, tableName);
    if (tableMissing) {
        TableSchema schema = new TableSchema(tableName);
        schema.addColumn("Col1");

        // Split keys: "-00".."-09", then "01".."09", then the MAX_KEY sentinel.
        Row.Key[] splitKeys = new Row.Key[20];
        int slot = 0;
        for (int n = 0; n <= 9; n++) {
            splitKeys[slot++] = new Row.Key("-0" + n);
        }
        for (int n = 1; n <= 9; n++) {
            splitKeys[slot++] = new Row.Key("0" + n);
        }
        splitKeys[slot] = Row.Key.MAX_KEY;

        CTable.createTable(cloudataConf, schema, splitKeys);
    }

    final JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class);
    final String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // Map side: plain text lines in, Text/Text pairs out, partitioned by key range.
    FileInputFormat.addInputPath(jobConf, inputPath);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.setMapperClass(SimpleUploaderMapper.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    // Reduce side: the file output path is only a placeholder and is deleted below.
    String scratchDirName = "SimpleUploaderMapReduce_" + System.currentTimeMillis();
    FileOutputFormat.setOutputPath(jobConf, new Path(scratchDirName));
    jobConf.setReducerClass(SimpleUploaderReducer.class);
    jobConf.setNumReduceTasks(Integer.parseInt(args[2]));
    jobConf.setMaxReduceAttempts(0);

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Always remove the placeholder output and the libraries staged by initMapReduce.
        FileSystem.get(jobConf).delete(FileOutputFormat.getOutputPath(jobConf), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}

From source file:org.cloudata.examples.web.TermUploadJob.java

License:Apache License

/**
 * Creates the term table if it does not exist yet and runs the MapReduce job
 * that uploads terms into it.
 *
 * <p>When the table is missing, the terms found in the files under
 * {@code blogdata/tmp/weight} are sorted and every N-th term is used as a
 * tablet split key, where N is the number of terms divided by
 * {@code maxReduce - 1}. The job's file output directory is a temporary path
 * that is deleted after the job, also when the job fails.
 *
 * @param options {@code options[0]} is the input path; the optional
 *                {@code options[1]} is the number of reduce tasks (defaults to
 *                twice the cluster's maximum reduce task count)
 * @throws Exception if table setup, job submission or cleanup fails
 */
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        // Build the table, pre-split on sampled terms.
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            // NOTE(review): the reader (and its underlying stream) is never closed;
            // close it here once CloudataLineReader's API has been confirmed.
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                // Copy: 'text' is reused for every line.
                terms.add(new Text(text));
            }
        }

        // With a single reducer there is nothing to split; the old code divided by zero here.
        int splitCount = maxReduce - 1;
        int termsPerTablet = splitCount > 0 ? terms.size() / splitCount : 0;
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                // Text.getBytes() exposes the backing array, which is only valid up to
                // getLength(); trim it so no stale trailing bytes end up in the key.
                rowKeys.add(new Row.Key(java.util.Arrays.copyOf(term.getBytes(), term.getLength())));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();
    if (tabletInfos.length != maxReduce) {
        // The previous code first set the reducer count to the tablet count and then
        // immediately overwrote it with maxReduce; keep the effective value but make
        // the mismatch visible.
        LOG.warn("Tablet count (" + tabletInfos.length + ") differs from reduce task count (" + maxReduce
                + ")");
    }

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    try {
        JobClient.runJob(jobConf);
    } finally {
        // Remove the placeholder output even when the job fails.
        fs.delete(tempOutputPath, true);
    }
}

From source file:org.cloudata.util.matrix.AbstractMatrix.java

License:Apache License

/**
 * Multiplies this matrix by {@code targetMatrix} with a MapReduce job and
 * stores the product in {@code resultMatrix}.
 *
 * <p>The job reads this matrix's table and column, is told where the target
 * and result matrices live through {@code MatrixInputFormat} properties, and
 * runs one reduce task per tablet of the result table. The job's file output
 * directory is a temporary path that is removed afterwards, also when the job
 * fails.
 *
 * @param targetMatrix the right-hand operand
 * @param resultMatrix the matrix whose table receives the result
 * @throws IOException if the job fails or the temporary output cannot be deleted
 */
public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException {
    Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(AbstractMatrix.class);
    jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(MatrixMutiplyMap.class);
    jobConf.setInputFormat(MatrixInputFormat.class);
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName);
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse());
    jobConf.setMapOutputKeyClass(MatrixItem.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.setReducerClass(MatrixMutiplyReduce.class);
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse());
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    // One reduce task per tablet of the result table.
    TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos();

    jobConf.setNumReduceTasks(tabletInfos.length);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    try {
        JobClient.runJob(jobConf);
    } finally {
        // Delete the temp output path even if the job failed, so failed runs
        // do not leave temp/Matrix_* directories behind.
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(tempOutputPath, true);
    }
}

From source file:org.commoncrawl.mapred.segmenter.Segmenter.java

License:Open Source License

/**
 * Runs the "Generate Segments" MapReduce job over the {@code part-*} files of
 * a bundle directory and moves the job output to its final location.
 *
 * <p>The job uses one reduce task per (crawler, bucket) pair, writes to a
 * temporary directory and, on success, renames that directory to
 * {@code finalOutputPath}.
 *
 * @param timestamp       not used by this method
 * @param crawlerArray    names of the crawlers to generate segments for
 * @param bundleInputPath directory containing the {@code part-*} input files
 * @param finalOutputPath destination the job output is renamed to
 * @return {@code true} if the job ran and its output was moved into place;
 *         {@code false} if there was no input, an I/O error occurred, or the
 *         rename failed
 */
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath,
        Path finalOutputPath) {
    try {

        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
        Configuration conf = CrawlEnvironment.getHadoopConfig();

        // NOTE(review): the timestamp is appended directly to the configured value
        // (no path separator), so the directory is a sibling of mapred.temp.dir,
        // not a child of it. Kept as-is; confirm whether that is intended.
        final Path tempOutputDir = new Path(conf.get("mapred.temp.dir", ".") + System.currentTimeMillis());

        JobConf job = new JobConf(conf);

        // compute crawlers string (comma separated) ...
        StringBuilder crawlerList = new StringBuilder();
        for (int i = 0; i < crawlerArray.length; ++i) {
            if (i != 0) {
                crawlerList.append(',');
            }
            crawlerList.append(crawlerArray[i]);
        }
        String crawlers = crawlerList.toString();

        LOG.info("Segment Generator:  crawlers:" + crawlers);

        job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers);
        LOG.info("Crawler Count:" + crawlerArray.length);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length);
        LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER);
        job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER);
        job.setJobName("Generate Segments");

        // globStatus may return null when the bundle path does not exist; treat
        // "no input" as a failure instead of letting a NullPointerException escape.
        FileStatus[] candidates = fs.globStatus(new Path(bundleInputPath, "part-*"));
        if (candidates == null || candidates.length == 0) {
            LOG.error("No part-* input files found under:" + bundleInputPath);
            return false;
        }
        for (FileStatus candidate : candidates) {
            LOG.info("Adding File:" + candidate.getPath());
            job.addInputPath(candidate.getPath());
        }

        // multi file merger 
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class);
        job.setMapOutputValueClass(SegmentGeneratorItemBundle.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SegmenterReducer.class);
        job.setPartitionerClass(BundleKeyPartitioner.class);
        job.setOutputKeyComparatorClass(BundleKeyComparator.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputPath(tempOutputDir);
        job.setNumTasksToExecutePerJvm(1000);
        job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER);

        LOG.info("Running  Segmenter OutputDir:" + tempOutputDir);
        JobClient.runJob(job);
        LOG.info("Finished Running Segmenter OutputDir:" + tempOutputDir + " Final Output Dir:"
                + finalOutputPath);

        // rename() signals failure through its return value, not an exception.
        if (!fs.rename(tempOutputDir, finalOutputPath)) {
            LOG.error("Failed to rename Segmenter OutputDir:" + tempOutputDir + " to Final Output Dir:"
                    + finalOutputPath);
            return false;
        }

        return true;
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
    }
}

From source file:org.gbif.ocurrence.index.solr.ConfTester.java

License:Apache License

/**
 * Builds the configuration of the "Sleep job".
 *
 * <p>{@code ConfTester} itself is registered as mapper, reducer and
 * partitioner; the sleep times and counts are passed to the tasks through
 * {@code sleep.job.*} properties.
 *
 * @param numMapper        number of map tasks
 * @param numReducer       number of reduce tasks
 * @param mapSleepTime     value stored under {@code sleep.job.map.sleep.time}
 * @param mapSleepCount    value stored under {@code sleep.job.map.sleep.count}
 * @param reduceSleepTime  value stored under {@code sleep.job.reduce.sleep.time}
 * @param reduceSleepCount value stored under {@code sleep.job.reduce.sleep.count}
 * @return the configured job
 */
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    final JobConf sleepJob = new JobConf(getConf(), ConfTester.class);
    sleepJob.setJobName("Sleep job");
    sleepJob.setSpeculativeExecution(false);

    // Task counts.
    sleepJob.setNumMapTasks(numMapper);
    sleepJob.setNumReduceTasks(numReducer);

    // ConfTester plays every role of the job.
    sleepJob.setMapperClass(ConfTester.class);
    sleepJob.setPartitionerClass(ConfTester.class);
    sleepJob.setReducerClass(ConfTester.class);
    sleepJob.setMapOutputKeyClass(IntWritable.class);
    sleepJob.setMapOutputValueClass(NullWritable.class);

    // Input and output formats; an input path is still registered.
    sleepJob.setInputFormat(SleepInputFormat.class);
    sleepJob.setOutputFormat(NullOutputFormat.class);
    FileInputFormat.addInputPath(sleepJob, new Path("ignored"));

    // Sleep parameters read by the tasks.
    sleepJob.setLong("sleep.job.map.sleep.time", mapSleepTime);
    sleepJob.setInt("sleep.job.map.sleep.count", mapSleepCount);
    sleepJob.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    sleepJob.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return sleepJob;
}

From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java

License:Open Source License

/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails./*from w  ww .j  ava  2s  .  c  o m*/
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT);

    ggfs.mkdirs(inDir);

    GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        //To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));

    }
}

From source file:org.hxx.hadoop.GeneratorHbase.java

License:Apache License

/**
 * Configures and runs the generate job that reads from {@code table} and writes a sequence file of
 * {@code Text}/{@code CrawlDatum} pairs into the generate directory of {@code segment}.
 *
 * @param table     name of the source table, also stored in the job configuration
 * @param segment   segment directory the output is written under
 * @param topN      stored under {@code GENERATOR_TOP_N}
 * @param reduceCnt number of reduce tasks; {@code -1} means "use the job's number of map tasks"
 * @param filter    stored under {@code GENERATOR_FILTER}
 * @param norm      stored under {@code GENERATOR_NORMALISE}
 * @param force     not used by this method
 * @return the finished job
 * @throws IOException if the job fails
 */
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter,
        boolean norm, boolean force) throws IOException {
    LOG.info("Generator: from table=" + table + " segment=" + segment);

    final JobConf job = new NutchJob(getConf());
    final String startedAt = new SimpleDateFormat("HH:mm:ss").format(System.currentTimeMillis());
    job.setJobName("generate:" + table + " " + startedAt + " path=" + segment);

    // Resolve the number of partitions: a partition per fetch task unless told otherwise.
    int partitions = (reduceCnt == -1) ? job.getNumMapTasks() : reduceCnt;
    if (partitions != 1 && "local".equals(job.get("mapred.job.tracker"))) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        partitions = 1;
    }

    // Record the real generation time together with the generator settings.
    final long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, partitions);
    job.setInt("partition.url.seed", new Random().nextInt());

    // GenerateMark is registered as both the mapper and the partitioner.
    job.setInputFormat(TableTopInputFormat.class);
    job.setMapperClass(GenerateMark.class);
    job.setPartitionerClass(GenerateMark.class);
    job.setNumReduceTasks(partitions);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));

    return JobClient.runJob(job);
}

From source file:org.hxx.hadoop.GeneratorMapHbase.java

License:Apache License

/**
 * Configures and runs the generate job that reads from {@code table} and writes a sequence file of
 * {@code Text}/{@code CrawlDatum} pairs into the generate directory of {@code segment}, partitioned by
 * {@code URLCountPartitioner}.
 *
 * @param table    name of the source table, also stored in the job configuration
 * @param segment  segment directory the output is written under
 * @param numLists requested number of partitions; currently overridden, see the note in the body
 * @param topN     stored under {@code GENERATOR_TOP_N}
 * @param curTime  not used by this method
 * @param filter   stored under {@code GENERATOR_FILTER}
 * @param norm     stored under {@code GENERATOR_NORMALISE}
 * @param force    not used by this method
 * @return the finished job
 * @throws IOException if the job fails
 */
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment: " + segment);

    final JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorMapHbase.class);
    final String startedAt = new SimpleDateFormat("yyyyMMdd HH:mm:ss").format(System.currentTimeMillis());
    job.setJobName("generate: from " + table + " " + startedAt);

    int partitions = numLists;
    if (partitions == -1) {
        partitions = job.getNumMapTasks(); // a partition per fetch task
    }
    // NOTE(review): hard-coded override (marked TODO in the original) - the value
    // requested by the caller is ignored and 4 is always used outside local mode.
    partitions = 4;
    if (partitions != 1 && "local".equals(job.get("mapred.job.tracker"))) {
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        partitions = 1;
    }

    // Record the real generation time together with the generator settings.
    final long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, partitions);

    job.setInputFormat(TableTopInputFormat.class);
    job.setMapperClass(GenerateMark.class);
    job.setPartitionerClass(URLCountPartitioner.class);
    job.setNumReduceTasks(partitions);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    FileOutputFormat.setOutputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));

    // An IOException from the job simply propagates to the caller.
    return JobClient.runJob(job);
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraSortJob.java

License:Apache License

/**
 * Runs the TeraSort job.
 *
 * <p>The partition file is written into the (qualified) input directory, shipped to the tasks through the
 * distributed cache under a symlink, and used by {@code TotalOrderPartitioner}.
 *
 * @param args {@code args[0]} input directory, {@code args[1]} output directory
 * @return always {@code 0}; a failing step surfaces as an exception instead
 * @throws Exception if setting up or running the job fails
 */
@SuppressWarnings("ProhibitedExceptionDeclared")
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");
    final JobConf job = (JobConf) getConf();

    // Locate the partition file inside the fully qualified input directory.
    final Path rawInput = new Path(args[0]);
    final Path inputDir = rawInput.makeQualified(rawInput.getFileSystem(job));
    final Path partitionFile = new Path(inputDir, TeraConstants.PARTITION_FILENAME);
    final String partitionLink = partitionFile.toString() + "#" + TeraConstants.PARTITION_FILENAME;
    final URI partitionUri = new URI(partitionLink);

    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJobName("TeraSort");
    job.setJarByClass(TeraSortJob.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true);

    // Write the partition file, then make it available to every task.
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    job.setInt("dfs.replication", 1);
    job.setInt("mapred.submit.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);

    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:org.terrier.applications.HadoopIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * <p>Modes, selected by the arguments:
 * <ul>
 * <li>no arguments: term-partitioned indexing; the reducers together create one inverted index,
 * which is merged afterwards when more than one reducer was used;</li>
 * <li><tt>-p n</tt>: document-partitioned indexing with <tt>n</tt> reducers, each producing its own index;</li>
 * <li><tt>--merge</tt>: only merges the lexicon/inverted files of a previous term-partitioned run.</li>
 * </ul>
 * The default number of reducers comes from the property <tt>terrier.hadoop.indexing.reducers</tt> (26).
 * The job factory is closed on every exit path.
 * @param args command line arguments, see above
 * @throws Exception if the job factory cannot be obtained, or if job setup or post-processing fails
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    try {
        if (args.length == 2 && args[0].equals("-p")) {
            // Parse first, so that the log message reports the reducer count actually used.
            numberOfReducers = Integer.parseInt(args[1]);
            logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
            docPartitioned = true;
        } else if (args.length == 1 && args[0].equals("--merge")) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
            else
                logger.error("No point merging 1 reduce task output");
            return;
        } else if (args.length == 0) {
            logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
            docPartitioned = false;
            if (numberOfReducers > MAX_REDUCE) {
                logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                        + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
            }
        } else {
            logger.fatal(usage());
            return;
        }

        if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
                false) instanceof BitCompressionConfiguration)) {
            logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                    + " - you can recompress the inverted index later using IndexRecompressor");
            return;
        }

        if (jf == null)
            throw new Exception("Could not get JobFactory from HadoopPlugin");
        final JobConf conf = jf.newJob();
        conf.setJobName("terrierIndexing");
        if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
                && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
            logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                    + ApplicationSetup.TERRIER_INDEX_PREFIX);
            return;
        }

        // The same indexer class acts as both mapper and reducer.
        boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
        if (blockIndexing) {
            conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
            conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
        } else {
            conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
            conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
        }
        FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
        conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
        conf.setMapOutputKeyClass(SplitEmittedTerm.class);
        conf.setMapOutputValueClass(MapEmittedPostingList.class);
        conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

        // Compress map output unless running locally. Constant-first comparison so
        // that an unset property does not cause a NullPointerException.
        if (!"local".equals(conf.get("mapred.job.tracker"))) {
            conf.setMapOutputCompressorClass(GzipCodec.class);
            conf.setCompressMapOutput(true);
        } else {
            conf.setCompressMapOutput(false);
        }

        conf.setInputFormat(MultiFileCollectionInputFormat.class);
        conf.setOutputFormat(NullOutputFormat.class);
        conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
        conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
        conf.setReduceSpeculativeExecution(false);

        //parse the collection.spec: one input path per line, '#' starts a comment line
        List<Path> paths = new ArrayList<Path>();
        BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
        try {
            String line = null;
            while ((line = specBR.readLine()) != null) {
                // Blank lines would make new Path(...) throw; skip them like comments.
                if (line.startsWith("#") || line.trim().length() == 0)
                    continue;
                paths.add(new Path(line));
            }
        } finally {
            specBR.close();
        }
        FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));
        conf.setNumReduceTasks(numberOfReducers);
        if (numberOfReducers > 1) {
            if (docPartitioned)
                conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
            else
                conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
        } else {
            //for JUnit tests, we seem to need to restore the original partitioner class
            conf.setPartitionerClass(HashPartitioner.class);
        }

        JobID jobId = null;
        boolean ranOK = true;
        try {
            RunningJob rj = JobClient.runJob(conf);
            jobId = rj.getID();
            HadoopUtility.finishTerrierJob(conf);
        } catch (Exception e) {
            logger.error("Problem running job", e);
            ranOK = false;
        }
        if (jobId != null) {
            deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
        }
        if (ranOK) {
            if (!docPartitioned) {
                if (numberOfReducers > 1)
                    mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
            }

            Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                    docPartitioned ? numberOfReducers : 1, jf);
        }
        System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    } finally {
        // Release the job factory on every path, including the early returns above.
        if (jf != null)
            jf.close();
    }
}