List of usage examples for org.apache.hadoop.mapred JobConf setJarByClass
public void setJarByClass(Class cls)
From source file:com.google.mr4c.hadoop.yarn.YarnTestBinding.java
License:Open Source License
public JobConf createTestMRJobConf() throws IOException { if (m_mrCluster == null) { startMrCluster();//ww w. j a v a2 s . co m } JobConf job = new JobConf(m_mrCluster.getConfig()); job.setJarByClass(AlgoRunner.class); return job; }
From source file:com.hadoopilluminated.examples.dancing.DistributedPentomino.java
License:Apache License
public int run(String[] args) throws Exception { JobConf conf; int depth = 5; int width = 9; int height = 10; Class<? extends Pentomino> pentClass; if (args.length == 0) { System.out.println("pentomino <output>"); ToolRunner.printGenericCommandUsage(System.out); return -1; }/*from w ww. j a v a2 s. c o m*/ conf = new JobConf(getConf()); width = conf.getInt("pent.width", width); height = conf.getInt("pent.height", height); depth = conf.getInt("pent.depth", depth); pentClass = conf.getClass("pent.class", OneSidedPentonimo.class, Pentomino.class); Path output = new Path(args[0]); Path input = new Path(output + "_input"); FileSystem fileSys = FileSystem.get(conf); try { FileInputFormat.setInputPaths(conf, input); FileOutputFormat.setOutputPath(conf, output); conf.setJarByClass(PentMap.class); conf.setJobName("dancingElephant"); Pentomino pent = ReflectionUtils.newInstance(pentClass, conf); pent.initialize(width, height); createInputDirectory(fileSys, input, pent, depth); // the keys are the prefix strings conf.setOutputKeyClass(Text.class); // the values are puzzle solutions conf.setOutputValueClass(Text.class); conf.setMapperClass(PentMap.class); conf.setReducerClass(IdentityReducer.class); conf.setNumMapTasks(2000); conf.setNumReduceTasks(1); JobClient.runJob(conf); } finally { fileSys.delete(input, true); } return 0; }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception { CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst); long[] rlens = new long[] { numRows }; long[] clens = new long[] { numColsAfter }; int[] brlens = new int[] { rblk.brlen }; int[] bclens = new int[] { rblk.bclen }; byte[] realIndexes = new byte[] { rblk.input }; byte[] resultIndexes = new byte[] { rblk.output }; JobConf job = new JobConf(ApplyTfBBMR.class); job.setJobName("ApplyTfBB"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfBBMR.class); // set relevant classes job.setMapperClass(ApplyTfBBMapper.class); MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL); MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); MRJobConfiguration.setCSVReblockInstructions(job, rblkInst); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInst); job.setInt("dfs.replication", replication); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false); //set up the number of reducers int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups); job.setNumReduceTasks(numRed);// w ww .j a va2 s . c om //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false); // configure mapper and the mapper output key value pairs job.setMapperClass(ApplyTfBBMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(BlockRow.class); //configure reducer job.setReducerClass(CSVReblockReducer.class); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(new Path(partOffsetsFile), "part-00000"); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specPath); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); RunningJob runjob = JobClient.runJob(job); MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job); Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i))); } return new JobReturn(ret.stats, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(ApplyTfCSVMR.class); job.setJobName("ApplyTfCSV"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfCSVMR.class); // set relevant classes job.setMapperClass(ApplyTfCSVMapper.class); job.setNumReduceTasks(0);//from w w w . j av a2 s . c om // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(partOffsetsFile); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setInt("dfs.replication", replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. Path outPath = new Path(outputPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true); FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specPath); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); // Since transform CSV produces part files w/ prefix transform-part-*, // delete all the "default" part-..... files deletePartFiles(fs, outPath); MatrixCharacteristics mc = new MatrixCharacteristics(); return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.transform.GenTfMtdMR.java
License:Open Source License
public static long runJob(String inputPath, String txMtdPath, String specFileWithIDs, String smallestFile, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(GenTfMtdMR.class); job.setJobName("GenTfMTD"); /* Setup MapReduce Job */ job.setJarByClass(GenTfMtdMR.class); // set relevant classes job.setMapperClass(GTFMTDMapper.class); job.setReducerClass(GTFMTDReducer.class); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DistinctValue.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setInt("dfs.replication", replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. Path outPath = new Path(txMtdPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true);//from ww w. j ava2s .co m FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specFileWithIDs); job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath); // offsets file to store part-file names and offsets for each input split job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); Counters c = runjob.getCounters(); long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter(); return tx_numRows; }
From source file:com.ibm.jaql.lang.expr.hadoop.NativeMapReduceExpr.java
License:Apache License
private JsonRecord eval_0_0(Configuration conf) throws Exception { JobConf job = new JobConf(conf); // set the jar if needed if (useSessionJarDefault.get()) { File jFile = ClassLoaderMgr.getExtensionJar(); if (jFile != null) { job.setJar(jFile.getAbsolutePath()); } else {/*from ww w .j a v a2 s .c om*/ job.setJarByClass(NativeMapReduceExpr.class); } } // submit the job boolean status = true; try { //JobClient.runJob(job); Util.submitJob(new JsonString(NativeMapReduceExpr.class.getName()), job); } catch (IOException e) { status = false; e.printStackTrace(); LOG.warn("native map-reduce job failed", e); } // setup the return value BufferedJsonRecord ret = new BufferedJsonRecord(); ret.add(STATUS, (status) ? JsonBool.TRUE : JsonBool.FALSE); return ret; }
From source file:com.ibm.jaql.util.ClassLoaderMgr.java
License:Apache License
private JarOutputStream getJarOutputStream() { //If we have a existing jar stream just use it if (extendedJarStream != null) { return extendedJarStream; }//from w ww. ja v a 2 s. co m //Otherwise create a new jar and outputstream in which new jars //can be appended File baseJar = null; if (extendedJarPath != null) { baseJar = extendedJarPath; } else { JobConf job = new JobConf(); job.setJarByClass(JaqlUtil.class); String original = job.getJar(); if (original != null) { baseJar = new File(original); } } //Creat new temp jaql file File tmpDir = new File(System.getProperty("java.io.tmpdir") + File.separator + "jaql_" + System.nanoTime()); tmpDir.mkdir(); // TODO: figure out why this causes occasional thread dumps on linux //tmpDir.deleteOnExit(); extendedJarPath = new File(tmpDir.getAbsoluteFile() + File.separator + "jaql.jar"); BaseUtil.LOG.info("creating new jaql.jar: " + extendedJarPath + ", starting from: " + baseJar); //Copy files over into new file try { JarOutputStream jout = null; if (baseJar != null) { JarInputStream jin = new JarInputStream(new FileInputStream(baseJar)); FileOutputStream fout = new FileOutputStream(extendedJarPath); Manifest man = jin.getManifest(); jout = man == null ? new JarOutputStream(fout) : new JarOutputStream(fout, man); copyJarFile(jin, jout); } else { jout = new JarOutputStream(new FileOutputStream(extendedJarPath)); } extendedJarStream = jout; } catch (IOException e) { BaseUtil.LOG.error("Error creating jar: " + e); throw new RuntimeException(e); } return extendedJarStream; }
From source file:com.mongodb.hadoop.util.MongoTool.java
License:Apache License
private int runMapredJob(final Configuration conf) { final JobConf job = new JobConf(conf, getClass()); /**//from w w w . j a va2 s.co m * Any arguments specified with -D <property>=<value> * on the CLI will be picked up and set here * They override any XML level values * Note that -D<space> is important - no space will * not work as it gets picked up by Java itself */ // TODO - Do we need to set job name somehow more specifically? // This may or may not be correct/sane job.setJarByClass(getClass()); final Class<? extends org.apache.hadoop.mapred.Mapper> mapper = MapredMongoConfigUtil.getMapper(conf); LOG.debug("Mapper Class: " + mapper); LOG.debug("Input URI: " + conf.get(MapredMongoConfigUtil.INPUT_URI)); job.setMapperClass(mapper); Class<? extends org.apache.hadoop.mapred.Reducer> combiner = MapredMongoConfigUtil.getCombiner(conf); if (combiner != null) { job.setCombinerClass(combiner); } job.setReducerClass(MapredMongoConfigUtil.getReducer(conf)); job.setOutputFormat(MapredMongoConfigUtil.getOutputFormat(conf)); job.setOutputKeyClass(MapredMongoConfigUtil.getOutputKey(conf)); job.setOutputValueClass(MapredMongoConfigUtil.getOutputValue(conf)); job.setInputFormat(MapredMongoConfigUtil.getInputFormat(conf)); Class mapOutputKeyClass = MapredMongoConfigUtil.getMapperOutputKey(conf); Class mapOutputValueClass = MapredMongoConfigUtil.getMapperOutputValue(conf); if (mapOutputKeyClass != null) { job.setMapOutputKeyClass(mapOutputKeyClass); } if (mapOutputValueClass != null) { job.setMapOutputValueClass(mapOutputValueClass); } /** * Determines if the job will run verbosely e.g. print debug output * Only works with foreground jobs */ final boolean verbose = MapredMongoConfigUtil.isJobVerbose(conf); /** * Run job in foreground aka wait for completion or background? */ final boolean background = MapredMongoConfigUtil.isJobBackground(conf); try { RunningJob runningJob = JobClient.runJob(job); if (background) { LOG.info("Setting up and running MapReduce job in background."); return 0; } else { LOG.info("Setting up and running MapReduce job in foreground, will wait for results. {Verbose? " + verbose + "}"); runningJob.waitForCompletion(); return 0; } } catch (final Exception e) { LOG.error("Exception while executing job... ", e); return 1; } }
From source file:com.spotify.hdfs2cass.BulkLoader.java
License:Apache License
public int run(String[] args) throws Exception { CommandLine cmdLine = parseOptions(args); String[] inputPaths = cmdLine.getOptionValues('i'); String seedNodeHost = cmdLine.getOptionValue('h'); String seedNodePort = cmdLine.getOptionValue('p', "9160"); String keyspace = cmdLine.getOptionValue('k'); String colfamily = cmdLine.getOptionValue('c'); int mappers = Integer.parseInt(cmdLine.getOptionValue('m', "0")); Integer copiers = Integer.parseInt(cmdLine.getOptionValue('P', "0")); String poolName = cmdLine.getOptionValue("pool"); ClusterInfo clusterInfo = new ClusterInfo(seedNodeHost, seedNodePort); clusterInfo.init(keyspace);/*from w w w . j a v a 2 s. c o m*/ final String partitionerClass = clusterInfo.getPartitionerClass(); final int reducers = adjustReducers(Integer.parseInt(cmdLine.getOptionValue('r', "0")), clusterInfo.getNumClusterNodes()); Configuration conf = new Configuration(); ConfigHelper.setOutputColumnFamily(conf, keyspace, colfamily); ConfigHelper.setOutputInitialAddress(conf, seedNodeHost); ConfigHelper.setOutputRpcPort(conf, seedNodePort); ConfigHelper.setOutputPartitioner(conf, partitionerClass); if (cmdLine.hasOption('s')) { conf.set("mapreduce.output.bulkoutputformat.buffersize", cmdLine.getOptionValue('s', "32")); } if (cmdLine.hasOption('M')) { conf.set("mapreduce.output.bulkoutputformat.streamthrottlembits", cmdLine.getOptionValue('M')); } if (cmdLine.hasOption('C')) { ConfigHelper.setOutputCompressionClass(conf, cmdLine.getOptionValue('C')); } if (cmdLine.hasOption('b')) { conf.setBoolean("com.spotify.hdfs2cass.base64", true); } JobConf job = new JobConf(conf); if (mappers > 0) job.setNumMapTasks(mappers); if (reducers > 0) job.setNumReduceTasks(reducers); if (copiers > 0) job.set("mapred.reduce.parallel.copies", copiers.toString()); if (poolName != null) job.set("mapred.fairscheduler.pool", poolName); // set the nodes as a param for the other hadoop nodes clusterInfo.setConf(job); String jobName = "bulkloader-hdfs-to-cassandra"; if (cmdLine.hasOption('n')) jobName += "-" + cmdLine.getOptionValue('n'); job.setJobName(jobName); job.setJarByClass(BulkLoader.class); job.setInputFormat(AvroAsTextInputFormat.class); for (String inputPath : inputPaths) { FileInputFormat.addInputPath(job, new Path(inputPath)); } //map just outputs text, reduce sends to cassandra job.setMapperClass(MapToText.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(Text.class); job.setPartitionerClass(CassandraPartitioner.class); job.setReducerClass(ReduceTextToCassandra.class); job.setOutputKeyClass(ByteBuffer.class); job.setOutputValueClass(List.class); if (cmdLine.hasOption('s')) job.setOutputFormat(BulkOutputFormat.class); else job.setOutputFormat(ColumnFamilyOutputFormat.class); JobClient.runJob(job); return 0; }
From source file:com.xiaoxiaomo.mr.utils.kafka.HadoopJob.java
License:Apache License
public int run(String[] args) throws Exception { CommandLineParser parser = new PosixParser(); Options options = buildOptions();/* w ww. j a v a2 s.c o m*/ CommandLine cmd = parser.parse(options, args); if (cmd.hasOption("h") || cmd.getArgs().length == 0) { printHelpAndExit(options); } String hdfsPath = cmd.getArgs()[0]; Configuration conf = getConf(); conf.setBoolean("mapred.map.tasks.speculative.execution", false); if (cmd.hasOption("topics")) { LOG.info("Using topics: " + cmd.getOptionValue("topics")); KafkaInputFormat.configureKafkaTopics(conf, cmd.getOptionValue("topics")); } else { printHelpAndExit(options); } KafkaInputFormat.configureZkConnection(conf, cmd.getOptionValue("zk-connect", "localhost:2181")); if (cmd.hasOption("consumer-group")) { CheckpointManager.configureUseZooKeeper(conf, cmd.getOptionValue("consumer-group", "dev-hadoop-loader")); } if (cmd.getOptionValue("autooffset-reset") != null) { KafkaInputFormat.configureAutoOffsetReset(conf, cmd.getOptionValue("autooffset-reset")); } JobConf jobConf = new JobConf(conf); if (cmd.hasOption("remote")) { String ip = cmd.getOptionValue("remote"); LOG.info("Default file system: hdfs://" + ip + ":8020/"); jobConf.set("fs.defaultFS", "hdfs://" + ip + ":8020/"); LOG.info("Remote jobtracker: " + ip + ":8021"); jobConf.set("mapred.job.tracker", ip + ":8021"); } Path jarTarget = new Path( getClass().getProtectionDomain().getCodeSource().getLocation() + "../kafka-hadoop-loader.jar"); if (new File(jarTarget.toUri()).exists()) { // running from IDE/ as maven jobConf.setJar(jarTarget.toUri().getPath()); LOG.info("Using target jar: " + jarTarget.toString()); } else { // running from jar remotely or locally jobConf.setJarByClass(getClass()); LOG.info("Using parent jar: " + jobConf.getJar()); } Job job = Job.getInstance(jobConf, "kafka.hadoop.loader"); job.setInputFormatClass(KafkaInputFormat.class); job.setMapperClass(HadoopJobMapper.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setOutputFormatClass(MultiOutputFormat.class); job.setNumReduceTasks(0); MultiOutputFormat.setOutputPath(job, new Path(hdfsPath)); MultiOutputFormat.setCompressOutput(job, cmd.getOptionValue("compress-output", "on").equals("on")); LOG.info("Output hdfs location: {}", hdfsPath); LOG.info("Output hdfs compression: {}", MultiOutputFormat.getCompressOutput(job)); return job.waitForCompletion(true) ? 0 : -1; }