List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
From source file:org.cloudata.examples.upload.SimpleUploaderMapReduce.java
License:Apache License
public void run(String[] args) throws IOException { if (args.length < 3) { System.out.println("Usage: java SimpleUploaderMapReduce <input path> <table name> <# reduce>"); System.exit(0);/* w w w. j a v a 2s . c om*/ } Path inputPath = new Path(args[0]); String tableName = args[1]; CloudataConf nconf = new CloudataConf(); if (!CTable.existsTable(nconf, tableName)) { TableSchema tableSchema = new TableSchema(tableName); tableSchema.addColumn("Col1"); Row.Key[] rowKeys = new Row.Key[20]; for (int i = 0; i < 10; i++) { rowKeys[i] = new Row.Key("-0" + i); } for (int i = 1; i < 10; i++) { rowKeys[9 + i] = new Row.Key("0" + i); } rowKeys[19] = Row.Key.MAX_KEY; CTable.createTable(nconf, tableSchema, rowKeys); } JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class); String libDir = CloudataMapReduceUtil.initMapReduce(jobConf); // <MAP> FileInputFormat.addInputPath(jobConf, inputPath); jobConf.setInputFormat(TextInputFormat.class); jobConf.setMapperClass(SimpleUploaderMapper.class); jobConf.setPartitionerClass(KeyRangePartitioner.class); jobConf.setMapOutputKeyClass(Text.class); jobConf.setMapOutputValueClass(Text.class); jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName); // </MAP> // <REDUCE> FileOutputFormat.setOutputPath(jobConf, new Path("SimpleUploaderMapReduce_" + System.currentTimeMillis())); jobConf.setReducerClass(SimpleUploaderReducer.class); jobConf.setNumReduceTasks(Integer.parseInt(args[2])); jobConf.setMaxReduceAttempts(0); // </REDUCE> try { JobClient.runJob(jobConf); } catch (Exception e) { e.printStackTrace(); } finally { FileSystem fs = FileSystem.get(jobConf); fs.delete(FileOutputFormat.getOutputPath(jobConf), true); CloudataMapReduceUtil.clearMapReduce(libDir); } }
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception { if (options.length < 1) { System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]"); System.exit(0);// w w w .j a v a 2 s .co m } JobConf jobConf = new JobConf(TermUploadJob.class); JobClient jobClinet = new JobClient(jobConf); int maxReduce = jobClinet.getClusterStatus().getMaxReduceTasks() * 2; if (options.length > 1) { maxReduce = Integer.parseInt(options[1]); } jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000); FileSystem fs = FileSystem.get(jobConf); CloudataConf nconf = new CloudataConf(); if (!CTable.existsTable(nconf, TERM_TABLE)) { //Table Path path = new Path("blogdata/tmp/weight"); FileStatus[] paths = fs.listStatus(path); if (paths == null || paths.length == 0) { LOG.error("No Partition info:" + path); return; } SortedSet<Text> terms = new TreeSet<Text>(); Text text = new Text(); for (FileStatus eachPath : paths) { CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath())); while (true) { int length = reader.readLine(text); if (length <= 0) { break; } terms.add(new Text(text)); } } int temrsPerTablet = terms.size() / (maxReduce - 1); int count = 0; List<Row.Key> rowKeys = new ArrayList<Row.Key>(); for (Text term : terms) { count++; if (count == temrsPerTablet) { rowKeys.add(new Row.Key(term.getBytes())); count = 0; } } rowKeys.add(Row.Key.MAX_KEY); TableSchema temrTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS); CTable.createTable(nconf, temrTableInfo, rowKeys.toArray(new Row.Key[] {})); } CTable termTable = CTable.openTable(nconf, TERM_TABLE); TabletInfo[] tabletInfos = termTable.listTabletInfos(); Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis()); jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")"); FileInputFormat.addInputPath(jobConf, new Path(options[0])); //<MAP> jobConf.setMapperClass(TermUploadMap.class); jobConf.setMapOutputKeyClass(Text.class); 
jobConf.setMapOutputValueClass(Text.class); jobConf.setInputFormat(TextInputFormat.class); jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE); jobConf.setPartitionerClass(WebKeyRangePartitioner.class); jobConf.setMaxMapAttempts(0); //</MAP> //<REDUCE> jobConf.setReducerClass(TermUploadReduce.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); jobConf.setNumReduceTasks(tabletInfos.length); FileOutputFormat.setOutputPath(jobConf, tempOutputPath); jobConf.setNumReduceTasks(maxReduce); jobConf.setMaxReduceAttempts(0); //<REDUCE> //Run Job JobClient.runJob(jobConf); fs.delete(tempOutputPath); }
From source file:org.cloudata.util.matrix.AbstractMatrix.java
License:Apache License
public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException { Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis()); JobConf jobConf = new JobConf(AbstractMatrix.class); jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")"); //<MAP> jobConf.setMapperClass(MatrixMutiplyMap.class); jobConf.setInputFormat(MatrixInputFormat.class); jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName()); jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName); jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName()); jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName); jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse()); jobConf.setMapOutputKeyClass(MatrixItem.class); jobConf.setMapOutputValueClass(Text.class); //</MAP> //<REDUCE> jobConf.setPartitionerClass(KeyRangePartitioner.class); jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName()); jobConf.setReducerClass(MatrixMutiplyReduce.class); jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName()); jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName); jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse()); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos(); jobConf.setNumReduceTasks(tabletInfos.length); jobConf.setMaxReduceAttempts(0);/*from ww w. j av a 2 s . co m*/ FileOutputFormat.setOutputPath(jobConf, tempOutputPath); //</REDUCE> //Run Job JobClient.runJob(jobConf); //delete temp output path FileSystem fs = FileSystem.get(jobConf); fs.delete(tempOutputPath, true); }
From source file:org.commoncrawl.mapred.segmenter.Segmenter.java
License:Open Source License
public static boolean generateCrawlSegments(long timestamp, String[] crawlerArray, Path bundleInputPath, Path finalOutputPath) {// w w w . jav a 2 s . c o m try { FileSystem fs = CrawlEnvironment.getDefaultFileSystem(); Configuration conf = CrawlEnvironment.getHadoopConfig(); final Path tempOutputDir = new Path( CrawlEnvironment.getHadoopConfig().get("mapred.temp.dir", ".") + System.currentTimeMillis()); JobConf job = new JobConf(conf); // compute crawlers string ... String crawlers = new String(); for (int i = 0; i < crawlerArray.length; ++i) { if (i != 0) crawlers += ","; crawlers += crawlerArray[i]; } LOG.info("Segment Generator: crawlers:" + crawlers); job.set(CrawlEnvironment.PROPERTY_CRAWLERS, crawlers); LOG.info("Crawler Count:" + crawlerArray.length); job.setInt(CrawlEnvironment.PROPERTY_NUM_CRAWLERS, crawlerArray.length); LOG.info("Num Buckets Per Crawler:" + NUM_BUCKETS_PER_CRAWLER); job.setInt(CrawlEnvironment.PROPERTY_NUM_BUCKETS_PER_CRAWLER, NUM_BUCKETS_PER_CRAWLER); job.setJobName("Generate Segments"); for (FileStatus candidate : fs.globStatus(new Path(bundleInputPath, "part-*"))) { LOG.info("Adding File:" + candidate.getPath()); job.addInputPath(candidate.getPath()); } // multi file merger job.setInputFormat(SequenceFileInputFormat.class); job.setMapOutputKeyClass(SegmentGeneratorBundleKey.class); job.setMapOutputValueClass(SegmentGeneratorItemBundle.class); job.setMapperClass(IdentityMapper.class); job.setReducerClass(SegmenterReducer.class); job.setPartitionerClass(BundleKeyPartitioner.class); job.setOutputKeyComparatorClass(BundleKeyComparator.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputPath(tempOutputDir); job.setNumTasksToExecutePerJvm(1000); job.setNumReduceTasks(crawlerArray.length * NUM_BUCKETS_PER_CRAWLER); LOG.info("Running Segmenter OutputDir:" + tempOutputDir); JobClient.runJob(job); LOG.info("Finished Running Segmenter 
OutputDir:" + tempOutputDir + " Final Output Dir:" + finalOutputPath); fs.rename(tempOutputDir, finalOutputPath); return true; } catch (IOException e) { LOG.error(CCStringUtils.stringifyException(e)); return false; } }
From source file:org.gbif.ocurrence.index.solr.ConfTester.java
License:Apache License
/**
 * Builds the configuration of the "Sleep job", in which {@code ConfTester} acts as mapper,
 * reducer and partitioner.
 *
 * @param numMapper number of map tasks
 * @param numReducer number of reduce tasks
 * @param mapSleepTime value stored under {@code sleep.job.map.sleep.time}
 * @param mapSleepCount value stored under {@code sleep.job.map.sleep.count}
 * @param reduceSleepTime value stored under {@code sleep.job.reduce.sleep.time}
 * @param reduceSleepCount value stored under {@code sleep.job.reduce.sleep.count}
 * @return the populated job configuration
 */
public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    final JobConf sleepJob = new JobConf(getConf(), ConfTester.class);
    sleepJob.setJobName("Sleep job");
    sleepJob.setSpeculativeExecution(false);

    // Task counts.
    sleepJob.setNumMapTasks(numMapper);
    sleepJob.setNumReduceTasks(numReducer);

    // ConfTester fills all three task roles.
    sleepJob.setMapperClass(ConfTester.class);
    sleepJob.setPartitionerClass(ConfTester.class);
    sleepJob.setReducerClass(ConfTester.class);

    // Key/value types and I/O formats.
    sleepJob.setMapOutputKeyClass(IntWritable.class);
    sleepJob.setMapOutputValueClass(NullWritable.class);
    sleepJob.setInputFormat(SleepInputFormat.class);
    sleepJob.setOutputFormat(NullOutputFormat.class);
    FileInputFormat.addInputPath(sleepJob, new Path("ignored"));

    // Sleep parameters.
    sleepJob.setLong("sleep.job.map.sleep.time", mapSleepTime);
    sleepJob.setInt("sleep.job.map.sleep.count", mapSleepCount);
    sleepJob.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    sleepJob.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);

    return sleepJob;
}
From source file:org.gridgain.grid.kernal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Open Source License
/** * Tests whole job execution with all phases in old and new versions of API with definition of custom * Serialization, Partitioner and IO formats. * @throws Exception If fails./*from w ww .j ava 2s . c o m*/ */ public void testMultiReducerWholeMapReduceExecution() throws Exception { GridGgfsPath inDir = new GridGgfsPath(PATH_INPUT); ggfs.mkdirs(inDir); GridGgfsPath inFile = new GridGgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input"); generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5", 12000, "key6", 18000); for (int i = 0; i < 2; i++) { boolean useNewAPI = i == 1; ggfs.delete(new GridGgfsPath(PATH_OUTPUT), true); flags.put("serializationWasConfigured", false); flags.put("partitionerWasConfigured", false); flags.put("inputFormatWasConfigured", false); flags.put("outputFormatWasConfigured", false); JobConf jobConf = new JobConf(); jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName()); //To split into about 6-7 items for v2 jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000); //For v1 jobConf.setInt("fs.local.block.size", 65000); // File system coordinates. 
setupFileSystems(jobConf); GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI); if (!useNewAPI) { jobConf.setPartitionerClass(CustomV1Partitioner.class); jobConf.setInputFormat(CustomV1InputFormat.class); jobConf.setOutputFormat(CustomV1OutputFormat.class); } Job job = Job.getInstance(jobConf); GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI); if (useNewAPI) { job.setPartitionerClass(CustomV2Partitioner.class); job.setInputFormatClass(CustomV2InputFormat.class); job.setOutputFormatClass(CustomV2OutputFormat.class); } job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.setInputPaths(job, new Path(ggfsScheme() + inFile.toString())); FileOutputFormat.setOutputPath(job, new Path(ggfsScheme() + PATH_OUTPUT)); job.setNumReduceTasks(3); job.setJarByClass(GridHadoopWordCount2.class); GridFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(job.getConfiguration())); fut.get(); assertTrue("Serialization was configured (new API is " + useNewAPI + ")", flags.get("serializationWasConfigured")); assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")", flags.get("partitionerWasConfigured")); assertTrue("Input format was configured (new API is = " + useNewAPI + ")", flags.get("inputFormatWasConfigured")); assertTrue("Output format was configured (new API is = " + useNewAPI + ")", flags.get("outputFormatWasConfigured")); assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n", readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000")); assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n", readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001")); assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n", readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002")); } }
From source file:org.hxx.hadoop.GeneratorHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: from table=" + table + " segment=" + segment); JobConf job = new NutchJob(getConf()); // job.setJarByClass(GeneratorHbase.class); job.setJobName("generate:" + table + " " + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis()) + " path=" + segment); if (reduceCnt == -1) { reduceCnt = job.getNumMapTasks(); // a partition per fetch task }//www. ja va 2 s . co m if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) { LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); reduceCnt = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCECNT, reduceCnt); job.setInt("partition.url.seed", new Random().nextInt()); job.setInputFormat(TableTopInputFormat.class);// ? job.setMapperClass(GenerateMark.class);// generate? job.setPartitionerClass(GenerateMark.class); job.setNumReduceTasks(reduceCnt); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = JobClient.runJob(job); return r; }
From source file:org.hxx.hadoop.GeneratorMapHbase.java
License:Apache License
private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force) throws IOException { LOG.info("Generator: segment: " + segment); JobConf job = new NutchJob(getConf()); job.setJarByClass(GeneratorMapHbase.class); job.setJobName("generate: from " + table + " " + (new SimpleDateFormat("yyyyMMdd HH:mm:ss")).format(System.currentTimeMillis())); // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000); if (numLists == -1) { numLists = job.getNumMapTasks(); // a partition per fetch task }//from w w w . j a va 2 s .co m numLists = 4;// TODO if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("Generator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } // job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.set(GENERATL_TABLE, table); job.setInt(GENERATL_REDUCENUM, numLists); job.setInputFormat(TableTopInputFormat.class);// ? job.setMapperClass(GenerateMark.class);// generate? job.setPartitionerClass(URLCountPartitioner.class); job.setNumReduceTasks(numLists); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); job.setOutputKeyComparatorClass(HashComparator.class); Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME); FileOutputFormat.setOutputPath(job, output); RunningJob r = null; try { r = JobClient.runJob(job); } catch (IOException e) { throw e; } return r; }
From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraSortJob.java
License:Apache License
/**
 * Configures and runs the TeraSort job.
 *
 * <p>The partition file is written into the (qualified) input directory and shipped to the
 * tasks through the distributed cache under the name {@code TeraConstants.PARTITION_FILENAME}.
 *
 * @param args {@code args[0]} is the input directory, {@code args[1]} the output directory
 * @return always {@code 0}; failures surface as exceptions
 * @throws Exception declared for the Hadoop calls made here
 */
@SuppressWarnings("ProhibitedExceptionDeclared")
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();

    // Resolve the input directory against its file system and derive the partition file from it.
    Path rawInput = new Path(args[0]);
    Path inputDir = rawInput.makeQualified(rawInput.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraConstants.PARTITION_FILENAME);
    // The URI fragment is the link name the cached file gets in the task working directory.
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraConstants.PARTITION_FILENAME);

    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setJobName("TeraSort");
    job.setJarByClass(TeraSortJob.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true);

    // Write the partition file, then publish it through the distributed cache.
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    job.setInt("dfs.replication", 1);
    job.setInt("mapred.submit.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);

    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}
From source file:org.terrier.applications.HadoopIndexing.java
License:Mozilla Public License
/**
 * Starts the MapReduce indexing.
 *
 * <p>Supported invocations:
 * <ul>
 * <li>no arguments: term-partitioned mode; the reducer count comes from the
 * {@code terrier.hadoop.indexing.reducers} property (default 26) and the reducer outputs are
 * merged into one inverted index;</li>
 * <li>{@code -p <n>}: document-partitioned mode with {@code n} reducers / output indices;</li>
 * <li>{@code --merge}: only merges the lexicon and inverted files of a previous run.</li>
 * </ul>
 * Anything else logs the usage message and returns.
 *
 * @param args command line arguments, see above
 * @throws Exception if no job factory is available, or declared for the indexing calls made here
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");

    if (args.length == 2 && args[0].equals("-p")) {
        // Parse before logging so the message reports the count that is actually used.
        numberOfReducers = Integer.parseInt(args[1]);
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");

    // From here on the factory has handed out a job, so it is closed on every exit path.
    try {
        final JobConf conf = jf.newJob();
        conf.setJobName("terrierIndexing");

        if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH) && Index
                .existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
            logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                    + ApplicationSetup.TERRIER_INDEX_PREFIX);
            return;
        }

        boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
        if (blockIndexing) {
            conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
            conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
        } else {
            conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
            conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
        }

        FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
        conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
        conf.setMapOutputKeyClass(SplitEmittedTerm.class);
        conf.setMapOutputValueClass(MapEmittedPostingList.class);
        conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

        // Null-safe comparison: an unset job tracker is treated as "not local".
        if (!"local".equals(conf.get("mapred.job.tracker"))) {
            conf.setMapOutputCompressorClass(GzipCodec.class);
            conf.setCompressMapOutput(true);
        } else {
            conf.setCompressMapOutput(false);
        }

        conf.setInputFormat(MultiFileCollectionInputFormat.class);
        conf.setOutputFormat(NullOutputFormat.class);
        conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
        conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
        conf.setReduceSpeculativeExecution(false);

        //parse the collection.spec
        List<Path> paths = new ArrayList<Path>();
        BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
        try {
            String line = null;
            while ((line = specBR.readLine()) != null) {
                // Skip comments, and blank lines (Path rejects an empty string).
                if (line.startsWith("#") || line.trim().length() == 0)
                    continue;
                paths.add(new Path(line));
            }
        } finally {
            specBR.close();
        }
        FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

        conf.setNumReduceTasks(numberOfReducers);
        if (numberOfReducers > 1) {
            if (docPartitioned)
                conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
            else
                conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
        } else {
            //for JUnit tests, we seem to need to restore the original partitioner class
            conf.setPartitionerClass(HashPartitioner.class);
        }

        JobID jobId = null;
        boolean ranOK = true;
        try {
            RunningJob rj = JobClient.runJob(conf);
            jobId = rj.getID();
            HadoopUtility.finishTerrierJob(conf);
        } catch (Exception e) {
            logger.error("Problem running job", e);
            ranOK = false;
        }

        if (jobId != null) {
            deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
        }

        if (ranOK) {
            if (!docPartitioned) {
                if (numberOfReducers > 1)
                    mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
            }
            Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                    docPartitioned ? numberOfReducers : 1, jf);
        }

        System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    } finally {
        jf.close();
    }
}