List of usage examples for org.apache.hadoop.mapred.JobConf#setNumReduceTasks
public void setNumReduceTasks(int n)
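setNumReduceTasks(int n) sets the requested number of reduce tasks for a job configured with the old org.apache.hadoop.mapred API; passing 0 skips the reduce phase and writes map output directly. Before the project samples below, here is a minimal, self-contained sketch of a typical call site. The class name, paths taken from args, and the choice of identity mapper/reducer are illustrative assumptions, not taken from any project listed on this page.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class SetNumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical driver: args[0] is the input dir, args[1] the output dir.
        JobConf job = new JobConf(SetNumReduceTasksExample.class);
        job.setJobName("setNumReduceTasks example");

        // KeyValueTextInputFormat yields Text/Text pairs, so the identity
        // mapper and reducer pass records through unchanged.
        job.setInputFormat(KeyValueTextInputFormat.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // The call this page documents: request 4 reduce tasks.
        // A value of 0 would disable the reduce phase entirely.
        job.setNumReduceTasks(4);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        JobClient.runJob(job);
    }
}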
From source file:com.benchmark.mapred.terasort.TeraSort.java
License:Apache License
public int run(String[] args) throws Exception { LOG.info("starting"); JobConf job = (JobConf) getConf(); Path inputDir = new Path(args[0]); if (args.length != 3) { System.out.println("ERROR: Wrong number of parameters: " + args.length + " instead of 3."); }/* ww w . j a va 2 s. c om*/ inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME); URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME); TeraInputFormat.setInputPaths(job, new Path(args[0])); FileOutputFormat.setOutputPath(job, new Path(args[1])); job.setJobName("TeraSort"); job.setJarByClass(TeraSort.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setInputFormat(TeraInputFormat.class); job.setOutputFormat(TeraOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); job.setNumReduceTasks(Integer.parseInt(args[2])); TeraInputFormat.writePartitionFile(job, partitionFile); DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); job.setInt("dfs.replication", 1); TeraOutputFormat.setFinalSync(job, true); Date startIteration = new Date(); JobClient.runJob(job); Date endIteration = new Date(); System.out.println( "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds."); LOG.info("done"); return 0; }
From source file:com.benchmark.mapred.terasort.TeraValidate.java
License:Apache License
public int run(String[] args) throws Exception {
    JobConf job = (JobConf) getConf();
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraValidate");
    job.setJarByClass(TeraValidate.class);
    job.setMapperClass(ValidateMapper.class);
    job.setReducerClass(ValidateReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    // force a single reducer
    job.setNumReduceTasks(1);
    // force a single split
    job.setLong("mapred.min.split.size", Long.MAX_VALUE);
    job.setInputFormat(TeraInputFormat.class);
    JobClient.runJob(job);
    return 0;
}
From source file:com.bigdata.diane.MiniTestDFSIO.java
License:Apache License
@SuppressWarnings("deprecation") private static void runIOTest(Class<? extends Mapper<Text, LongWritable, Text, Text>> mapperClass, Path outputDir, Configuration fsConfig) throws IOException { JobConf job = new JobConf(fsConfig, MiniTestDFSIO.class); FileInputFormat.setInputPaths(job, CONTROL_DIR); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(mapperClass);// w w w. j a v a2 s . com job.setReducerClass(AccumulatingReducer.class); FileOutputFormat.setOutputPath(job, outputDir); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); JobClient.runJob(job); }
From source file:com.cloudera.sqoop.orm.TestParseMethods.java
License:Apache License
public void runParseTest(String fieldTerminator, String lineTerminator, String encloser, String escape,
        boolean encloseRequired) throws IOException {

    ClassLoader prevClassLoader = null;

    String[] argv = getArgv(true, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
    runImport(argv);
    try {
        String tableClassName = getTableName();

        argv = getArgv(false, fieldTerminator, lineTerminator, encloser, escape, encloseRequired);
        SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

        CompilationManager compileMgr = new CompilationManager(opts);
        String jarFileName = compileMgr.getJarFilename();

        // Make sure the user's class is loaded into our address space.
        prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

        JobConf job = new JobConf();
        job.setJar(jarFileName);

        // Tell the job what class we're testing.
        job.set(ReparseMapper.USER_TYPE_NAME_KEY, tableClassName);

        // use local mode in the same JVM.
        ConfigurationHelper.setJobtrackerAddr(job, "local");
        if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
            job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
        }
        String warehouseDir = getWarehouseDir();
        Path warehousePath = new Path(warehouseDir);
        Path inputPath = new Path(warehousePath, getTableName());
        Path outputPath = new Path(warehousePath, getTableName() + "-out");

        job.setMapperClass(ReparseMapper.class);
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        JobClient.runJob(job);
    } catch (InvalidOptionsException ioe) {
        fail(ioe.toString());
    } catch (ParseException pe) {
        fail(pe.toString());
    } finally {
        if (null != prevClassLoader) {
            ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
        }
    }
}
From source file:com.cloudera.sqoop.orm.TestParseMethods.java
License:Apache License
public void testFieldSetter() throws IOException {
    ClassLoader prevClassLoader = null;

    String[] types = { "VARCHAR(32)", "VARCHAR(32)" };
    String[] vals = { "'meep'", "'foo'" };
    createTableWithColTypes(types, vals);

    String[] argv = getArgv(true, ",", "\\n", "\\\'", "\\", false);
    runImport(argv);
    try {
        String tableClassName = getTableName();

        argv = getArgv(false, ",", "\\n", "\\\'", "\\", false);
        SqoopOptions opts = new ImportTool().parseArguments(argv, null, null, true);

        CompilationManager compileMgr = new CompilationManager(opts);
        String jarFileName = compileMgr.getJarFilename();

        // Make sure the user's class is loaded into our address space.
        prevClassLoader = ClassLoaderStack.addJarFile(jarFileName, tableClassName);

        JobConf job = new JobConf();
        job.setJar(jarFileName);

        // Tell the job what class we're testing.
        job.set(ExplicitSetMapper.USER_TYPE_NAME_KEY, tableClassName);
        job.set(ExplicitSetMapper.SET_COL_KEY, BASE_COL_NAME + "0");
        job.set(ExplicitSetMapper.SET_VAL_KEY, "this-is-a-test");

        // use local mode in the same JVM.
        ConfigurationHelper.setJobtrackerAddr(job, "local");
        if (!BaseSqoopTestCase.isOnPhysicalCluster()) {
            job.set(CommonArgs.FS_DEFAULT_NAME, CommonArgs.LOCAL_FS);
        }
        String warehouseDir = getWarehouseDir();
        Path warehousePath = new Path(warehouseDir);
        Path inputPath = new Path(warehousePath, getTableName());
        Path outputPath = new Path(warehousePath, getTableName() + "-out");

        job.setMapperClass(ExplicitSetMapper.class);
        job.setNumReduceTasks(0);
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        JobClient.runJob(job);
    } catch (InvalidOptionsException ioe) {
        fail(ioe.toString());
    } catch (ParseException pe) {
        fail(pe.toString());
    } finally {
        if (null != prevClassLoader) {
            ClassLoaderStack.setCurrentClassLoader(prevClassLoader);
        }
    }
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("m", "model", true, "location of the model");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        String model = line.getOptionValue("m");
        if (line.hasOption("help")) {
            formatter.printHelp("ClassifierJob", options);
            return 0;
        }
        if (model == null | input == null | output == null) {
            formatter.printHelp("ClassifierJob", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("ClassifierJob", options);
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    String modelPath = line.getOptionValue("m");

    JobConf job = new JobConf(getConf());
    // push the model file to the DistributedCache
    DistributedCache.addCacheArchive(new URI(modelPath), job);

    job.setJarByClass(this.getClass());
    job.setJobName("ClassifierJob : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(TextClassifierMapper.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.set(modelNameParam, modelPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {

    if (args.length != 1) {
        String syntax = "com.digitalpebble.behemoth.ESIndexerJob input";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into ElasticSearch");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(MapWritable.class);

    job.setMapperClass(BehemothToESMapper.class);

    // disable speculative execution when writing to ES
    job.setSpeculativeExecution(false);

    // job.set("es.resource", "radio/artists"); // index used for storing data

    // use dedicated output format
    job.setOutputFormat(EsOutputFormat.class);

    FileInputFormat.addInputPath(job, inputPath);

    // no reducer : send straight to elasticsearch at end of mapping
    job.setNumReduceTasks(0);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception while running job", e);
        return -1;
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.gate.GATEDriver.java
License:Apache License
public int run(String[] args) throws Exception {

    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 | args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;

    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (fs.exists(zap) == false) {
        System.err.println("The GATE application " + zip_application_path + "can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path.toString());

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    } finally {
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.io.nutch.NutchSegmentConverterJob.java
License:Apache License
public void convert(List<Path> list, Path output) throws IOException {

    JobConf job = new JobConf(getConf());
    job.setJobName("Converting Nutch segments");

    job.setJarByClass(this.getClass());

    for (Path p : list) {
        FileInputFormat.addInputPath(job, new Path(p, Content.DIR_NAME));
    }

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(NutchSegmentConverterJob.class);

    // no reducers
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    long start = System.currentTimeMillis();
    JobClient.runJob(job);
    long finish = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("NutchSegmentConverter completed. Timing: " + (finish - start) + " ms");
    }
}
From source file:com.digitalpebble.behemoth.io.warc.WARCConverterJob.java
License:Apache License
public void convert(Path warcpath, Path output) throws IOException {

    JobConf job = new JobConf(getConf());
    job.setJobName("Convert WARC " + warcpath);

    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, warcpath);
    job.setInputFormat(WarcFileInputFormat.class);

    job.setMapperClass(WARCConverterJob.class);

    // no reducers
    job.setNumReduceTasks(0);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    long start = System.currentTimeMillis();
    JobClient.runJob(job);
    long finish = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("WARCConverterJob completed. Timing: " + (finish - start) + " ms");
    }
}