List of usage examples for org.apache.hadoop.mapreduce.Job#setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
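Sets the number of reduce tasks for the job. The call is only legal while the job is still being defined; once the job has been submitted it throws IllegalStateException. Before the project examples below, here is a minimal, hypothetical driver sketch (the class name NumReduceTasksExample and its argument handling are invented for illustration): passing 0 produces a map-only job whose mapper output skips the shuffle and sort phases and goes straight to the OutputFormat, while a positive value requests that many reduce tasks.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same old-style Job constructor used by the examples below.
        Job job = new Job(conf, "num-reduce-tasks-example");

        job.setJarByClass(NumReduceTasksExample.class);
        job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class); // identity mapper
        job.setOutputKeyClass(LongWritable.class); // TextInputFormat's keys, passed through by the identity mapper
        job.setOutputValueClass(Text.class);

        // 0 = map-only job: no shuffle, no sort, no reducers; mapper output is written directly.
        // A positive value N would instead request N reduce tasks (and N output partitions).
        job.setNumReduceTasks(0);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

All but one of the examples below follow this map-only pattern with setNumReduceTasks(0); PairWiseDistance at the end is the exception, sizing its reducer count to the number of block divisions it computes.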
From source file: edu.indiana.d2i.htrc.io.ParallelDataCopyJob.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    int maxIdsPerSplit = Integer.valueOf(args[2]);
    String dataAPIConfClassName = args[3];
    int maxIdsPerReq = Integer.valueOf(args[4]);

    logger.info("ParallelDataCopyJob ");
    logger.info(" - input: " + inputPath); // id list
    logger.info(" - output: " + outputPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);

    Job job = new Job(getConf(), "Copy data from HTRC data storage parallely.");
    job.setJarByClass(ParallelDataCopyJob.class);

    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("ParallelDataCopyJob took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromHDFSRawText.java
License: Apache License
private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from plain text in HDFS.");
    job.setJarByClass(SparseVectorsFromHDFSRawText.class);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // dictionary and lucene
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromLucene.java
License: Apache License
private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Lucene.");
    job.setJarByClass(SparseVectorsFromLucene.class);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // dictionary and lucene
    // job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set("htrc.lucene.index.path", indexLoc);

    job.setInputFormatClass(LuceneIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromRawText.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 7) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    int maxIdsPerSplit = Integer.valueOf(args[3]);
    String dataAPIConfClassName = args[4];
    String analyzerClassName = args[5];
    int maxIdsPerReq = Integer.valueOf(args[6]);

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - maxIdsPerSplit: " + maxIdsPerSplit);
    logger.info(" - dataAPIConfClassName: " + dataAPIConfClassName);
    logger.info(" - analyzerName: " + analyzerClassName);
    logger.info(" - maxIdsPerReq: " + maxIdsPerReq);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SparseVectorsFromRawText.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // set data api conf
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);
    Utilities.setDataAPIConf(job.getConfiguration(), dataAPIConfClassName, maxIdsPerReq);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SparseVectorsFromRawText took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsFromSolr.java
License: Apache License
private void createSparseVector(Path inputPath, Path outputPath)
        throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vector from Solr.");
    job.setJarByClass(SparseVectorsFromSolr.class);

    // maximum #id per split
    job.getConfiguration().setInt(HTRCConstants.MAX_IDNUM_SPLIT, maxIdsPerSplit);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    // dictionary and solr
    // job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictDir);
    job.getConfiguration().set("htrc.solr.dictionary", dictDir);
    job.getConfiguration().set(HTRCConstants.SOLR_MAIN_URL, solrEPR);

    job.setInputFormatClass(SolrIDFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.waitForCompletion(true);
}
From source file: edu.indiana.d2i.htrc.io.SparseVectorsToMemcached.java
License: Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SparseVectorsToMemcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(IDInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(idListDir));

    job.waitForCompletion(true);
}
From source file: edu.indiana.d2i.htrc.io.SVFromHDFS2HDFS.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String dictPath = args[2];
    String analyzerClassName = args[3];

    logger.info("DataCopyTokenizerJob ");
    logger.info(" - input: " + inputPath);
    logger.info(" - output: " + outputPath);
    logger.info(" - dictPath: " + dictPath);
    logger.info(" - analyzerName: " + analyzerClassName);

    Job job = new Job(getConf(), "Create sparse vector from HTRC data storage.");
    job.setJarByClass(SVFromHDFS2HDFS.class);

    // set dictionary
    job.getConfiguration().set(HTRCConstants.DICTIONARY_PATH, dictPath);

    // set analyzer
    job.getConfiguration().set(DocumentProcessor.ANALYZER_CLASS, analyzerClassName);

    // no speculation
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    Path output = new Path(outputPath);
    FileInputFormat.setInputPaths(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, output);
    FileSystem.get(job.getConfiguration()).delete(output, true);

    long start = System.nanoTime();
    job.waitForCompletion(true);
    logger.info("SVFromHDFS2HDFS took " + (System.nanoTime() - start) / 1e9 + " seconds.");

    return 0;
}
From source file: edu.indiana.d2i.htrc.io.SVFromHDFS2Memcached.java
License: Apache License
private void parallelTransform() throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(getConf(), "Create sparse vectors from HTRC data storage, store them in MemCached");
    job.setJarByClass(SVFromHDFS2Memcached.class);

    Configuration conf = job.getConfiguration();
    setupConfiguration(conf);

    // no speculation
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(MemCachedOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(VectorWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(VectorWritable.class);

    job.setMapperClass(SparseVectorUtil.Text2VectorMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, new Path(vecDir));

    job.waitForCompletion(true);
}
From source file: edu.indiana.d2i.htrc.kmeans.MemCachedKMeansDriver.java
License: Apache License
private static void clusterDataMR(Configuration conf, Path input, Path clustersIn, Path output,
        DistanceMeasure measure, String convergenceDelta)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf.set(KMeansConfigKeys.CLUSTER_PATH_KEY, clustersIn.toString());
    conf.set(KMeansConfigKeys.DISTANCE_MEASURE_KEY, measure.getClass().getName());
    conf.set(KMeansConfigKeys.CLUSTER_CONVERGENCE_KEY, convergenceDelta);

    Job job = new Job(conf, "KMeans Driver running clusterData over input: " + input);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightedPropertyVectorWritable.class);

    FileInputFormat.setInputPaths(job, input);
    HadoopUtil.delete(conf, output);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapperClass(KMeansClusterMapper.class);
    job.setNumReduceTasks(0);
    job.setJarByClass(MemCachedKMeansDriver.class);

    if (!job.waitForCompletion(true)) {
        throw new InterruptedException("K-Means Clustering failed processing " + clustersIn);
    }
}
From source file: edu.indiana.soic.ts.mapreduce.pwd.PairWiseDistance.java
License: Open Source License
public int execJob(Configuration conf, String sequenceFileFullPath, String sequenceFile, String distDir)
        throws Exception {
    /* input parameters */
    LOG.info(sequenceFileFullPath);

    Job job = new Job(conf, "Pairwise-calc-" + sequenceFile);

    /* create the base dir for this job; delete and recreate it if it exists */
    Path hdMainDir = new Path(distDir + "/" + sequenceFile);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(hdMainDir, true);

    Path hdInputDir = new Path(hdMainDir, "data");
    if (!fs.mkdirs(hdInputDir)) {
        throw new IOException("Mkdirs failed to create " + hdInputDir.toString());
    }

    int noOfSequences = getNoOfSequences(sequenceFileFullPath, fs);
    int noOfDivisions = (int) Math.ceil(noOfSequences / (double) blockSize);
    int noOfBlocks = (noOfDivisions * (noOfDivisions + 1)) / 2;
    LOG.info("No of divisions :" + noOfDivisions + "\nNo of blocks :" + noOfBlocks + "\nBlock size :"
            + blockSize);

    // Retrieve the configuration from the job to set the properties.
    // Setting properties on the original conf does not work (possible Hadoop bug).
    Configuration jobConf = job.getConfiguration();

    // Input dir in HDFS. Create this in the newly created job base dir.
    Path inputDir = new Path(hdMainDir, "input");
    if (!fs.mkdirs(inputDir)) {
        throw new IOException("Mkdirs failed to create " + inputDir.toString());
    }

    long dataPartitionStartTime = System.nanoTime();
    partitionData(sequenceFileFullPath, noOfSequences, blockSize, fs, noOfDivisions, jobConf, inputDir);
    distributeData(blockSize, conf, fs, hdInputDir, noOfDivisions);
    long dataPartTime = (System.nanoTime() - dataPartitionStartTime) / 1000000;
    LOG.info("Data Partition & Scatter Completed in (ms):" + dataPartTime);

    // Output dir in HDFS
    Path hdOutDir = new Path(hdMainDir, "out");

    jobConf.setInt(Constants.BLOCK_SIZE, blockSize);
    jobConf.setInt(Constants.NO_OF_DIVISIONS, noOfDivisions);
    jobConf.setInt(Constants.NO_OF_SEQUENCES, noOfSequences);
    jobConf.set(Constants.DIST_FUNC, distFunc);

    job.setJarByClass(PairWiseDistance.class);
    job.setMapperClass(SWGMap.class);
    job.setReducerClass(SWGReduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(SWGWritable.class);
    FileInputFormat.setInputPaths(job, hdInputDir);
    FileOutputFormat.setOutputPath(job, hdOutDir);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setNumReduceTasks(noOfDivisions);

    long startTime = System.currentTimeMillis();
    int exitStatus = job.waitForCompletion(true) ? 0 : 1;
    double executionTime = (System.currentTimeMillis() - startTime) / 1000.0;
    LOG.info("Job Finished in " + executionTime + " seconds");
    LOG.info("# #seq\t#blockS\tTtime\tinput\tdataDistTime\toutput" + noOfSequences + "\t" + noOfBlocks + "\t"
            + executionTime + "\t" + sequenceFileFullPath + "\t" + dataPartTime + "\t" + hdMainDir);

    return exitStatus;
}