List of usage examples for org.apache.hadoop.mapreduce.Job
Job(Configuration conf) throws IOException
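Before the collected examples, here is a minimal, self-contained driver sketch written for this page (not taken from any of the projects below); the identity map-only setup and the argument handling are illustrative assumptions. Note that in Hadoop 2.x this constructor is deprecated in favor of Job.getInstance(Configuration), which is why one of the examples below carries @SuppressWarnings("deprecation").

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

// Hypothetical driver class, written for illustration only.
public class MinimalJobDriver {
    public static void main(String[] args)
            throws IOException, InterruptedException, ClassNotFoundException {
        Configuration conf = new Configuration();

        // The constructor takes a private copy of the Configuration, so any
        // settings applied after this point must go through
        // job.getConfiguration() -- a pattern visible in the examples below.
        Job job = new Job(conf);
        job.setJobName("minimal-identity-job");
        job.setJarByClass(MinimalJobDriver.class);

        // Identity map-only job: TextInputFormat produces <LongWritable, Text>
        // pairs, which the default Mapper passes straight through.
        job.setMapperClass(Mapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}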
From source file:com.inmobi.conduit.local.LocalStreamService.java
License:Apache License
protected Job createJob(Path inputPath, long totalSize) throws IOException {
    String jobName = getName();
    Configuration conf = currentCluster.getHadoopConf();
    conf.set(ConduitConstants.AUDIT_ENABLED_KEY,
        System.getProperty(ConduitConstants.AUDIT_ENABLED_KEY));
    Job job = new Job(conf);
    job.setJobName(jobName);
    // DistributedCache.addFileToClassPath(inputFormatJarDestPath,
    //     job.getConfiguration());
    job.getConfiguration().set("tmpjars",
        inputFormatJarDestPath.toString() + "," + auditUtilJarDestPath.toString());
    LOG.debug("Adding file [" + inputFormatJarDestPath + "] to distributed cache");
    job.setInputFormatClass(UniformSizeInputFormat.class);
    Class<? extends Mapper<Text, FileStatus, NullWritable, Text>> mapperClass = getMapperClass();
    job.setJarByClass(mapperClass);
    job.setMapperClass(mapperClass);
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    // setting identity reducer
    job.setReducerClass(Reducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, tmpCounterOutputPath);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set(LOCALSTREAM_TMP_PATH, tmpPath.toString());
    job.getConfiguration().set(SRC_FS_DEFAULT_NAME_KEY,
        srcCluster.getHadoopConf().get(FS_DEFAULT_NAME_KEY));

    // set configurations needed for UniformSizeInputFormat
    int numMaps = getNumMapsForJob(totalSize);
    job.getConfiguration().setInt(DistCpConstants.CONF_LABEL_NUM_MAPS, numMaps);
    job.getConfiguration().setLong(DistCpConstants.CONF_LABEL_TOTAL_BYTES_TO_BE_COPIED, totalSize);
    job.getConfiguration().set(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH, inputPath.toString());
    LOG.info("Expected number of maps [" + numMaps + "] Total data size [" + totalSize + "]");

    return job;
}
From source file:com.inmobi.databus.local.LocalStreamService.java
License:Apache License
private Job createJob(Path inputPath) throws IOException {
    String jobName = "localstream";
    Configuration conf = cluster.getHadoopConf();
    Job job = new Job(conf);
    job.setJobName(jobName);
    KeyValueTextInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setJarByClass(CopyMapper.class);
    job.setMapperClass(CopyMapper.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.getConfiguration().set("mapred.map.tasks.speculative.execution", "false");
    job.getConfiguration().set("localstream.tmp.path", tmpPath.toString());
    return job;
}
From source file:com.intel.hadoop.hbase.dot.KEY.java
License:Apache License
private void doMapReduce(Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass, String mrTableName)
        throws IOException, ClassNotFoundException, InterruptedException {
    this.conf.set(KEY.INPUT_TABLE, mrTableName);
    Job job = new Job(this.conf);
    job.setJobName("Generate Data for [" + mrTableName + "]");
    job.setJarByClass(GenerateTestTable.class);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(LongWritable.class);
    FileSystem fs = FileSystem.get(conf);
    Path path = new Path("/tmp", "tempout");
    fs.delete(path, true);
    FileOutputFormat.setOutputPath(job, path);
    job.setMapperClass(mapperClass);
    job.setNumReduceTasks(0);
    TableMapReduceUtil.addDependencyJars(job);
    // Add a Class from the hbase.jar so it gets registered too.
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
        org.apache.hadoop.hbase.util.Bytes.class);
    TableMapReduceUtil.initCredentials(job);
    job.waitForCompletion(true);
}
From source file:com.kasabi.labs.freebase.mr.Freebase2RDFDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (log.isDebugEnabled()) {
        log.debug("run({})", Utils.toString(args));
    }

    if (args.length != 2) {
        System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getName());
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }

    Configuration configuration = getConf();
    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
        Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec",
            "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
        Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName("Freebase2RDFDriver");
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(Freebase2RDFMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    job.setReducerClass(Freebase2RDFReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    Utils.setReducers(job, configuration, log);

    job.setOutputFormatClass(TextOutputFormat.class);

    if (log.isDebugEnabled())
        Utils.log(job, log);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.knewton.mrtool.example.JsonMRExample.java
License:Apache License
/**
 * @param args
 * @throws IOException
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job(new Configuration());

    job.setInputFormatClass(RecommendationsInputFormat.class);
    RecommendationsInputFormat.setInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(RecommendationWritable.class);

    job.waitForCompletion(true);
}
From source file:com.kse.bigdata.main.Driver.java
License:Apache License
public static void main(String[] args) throws Exception {
    /**********************************************************************************
     ** Merge the source files into one.
     ** Should change the directories of each file before executing the program.
     **********************************************************************************/
    // String inputFileDirectory = "/media/bk/??/BigData_Term_Project/Debug";
    // String resultFileDirectory = "/media/bk/??/BigData_Term_Project/debug.csv";
    // File resultFile = new File(resultFileDirectory);
    // if (!resultFile.exists())
    //     new SourceFileMerger(inputFileDirectory, resultFileDirectory).mergeFiles();

    /**********************************************************************************
     * Hadoop operation.
     * Before starting, check the length of the sequence we want to predict.
     **********************************************************************************/
    Configuration conf = new Configuration();

    // Enable MapReduce intermediate compression as Snappy
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.SnappyCodec");

    // Enable profiling
    // conf.setBoolean("mapred.task.profile", true);

    String testPath = null;
    String inputPath = null;
    String outputPath = null;

    int sampleSize = 1;
    ArrayList<String> results = new ArrayList<String>();

    for (int index = 0; index < args.length; index++) {
        /*
         * Mandatory commands
         */
        // Extract the input path string from the command line.
        if (args[index].equals("-in"))
            inputPath = args[index + 1];

        // Extract the output path string from the command line.
        if (args[index].equals("-out"))
            outputPath = args[index + 1];

        // Extract the test data path string from the command line.
        if (args[index].equals("-test"))
            testPath = args[index + 1];

        /*
         * Optional commands
         */
        // Extract the number of neighbors.
        if (args[index].equals("-nn"))
            conf.setInt(Reduce.NUMBER_OF_NEAREAST_NEIGHBOR, Integer.parseInt(args[index + 1]));

        // Whether the job uses normalization or not.
        if (args[index].equals("-norm"))
            conf.setBoolean(Map.NORMALIZATION, true);

        // Extract the sample size to test.
        if (args[index].equals("-s"))
            sampleSize = Integer.valueOf(args[index + 1]);

        // Whether the job uses mean or median. [Default: mean]
        if (args[index].equals("-med"))
            conf.setBoolean(Reduce.MEDIAN, true);
    }

    String outputFileName = "part-r-00000";

    SequenceSampler sampler = new SequenceSampler(testPath, sampleSize);
    LinkedList<Sequence> testSequences = sampler.getRandomSample();

    // Test sequence
    // String testSeqString = "13.591-13.674-13.778-13.892-13.958-14.049-14.153-14.185-14.169-14.092-13.905-13.702-13.438-13.187-13.0-12.914-12.868-12.766-12.62-12.433-12.279-12.142-12.063-12.025-100";
    // Sequence testSeq = new Sequence(testSeqString);
    // LinkedList<Sequence> testSequences = new LinkedList<>();
    // testSequences.add(testSeq);

    for (Sequence seq : testSequences) {
        /* ******************** Hadoop Launch *********************** */
        System.out.println(seq.getTailString());

        conf.set(Map.INPUT_SEQUENCE, seq.toString());

        Job job = new Job(conf);
        job.setJarByClass(Driver.class);
        job.setJobName("term-project-driver");

        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Should think of another way to implement the combiner class;
        // the current implementation is not helpful to the job.
        // job.setCombinerClass(Combiner.class);

        // Set the number of reduce tasks to 1 to keep the 100 nearest neighbors in a sorted set.
        job.setNumReduceTasks(1);
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(inputPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));

        job.waitForCompletion(true);

        /*
         * When the job finishes, read its result and store it in the results list.
         */
        try {
            FileSystem hdfs = FileSystem.get(new Configuration());
            BufferedReader fileReader = new BufferedReader(
                new InputStreamReader(hdfs.open(new Path(outputPath + "/" + outputFileName))));

            String line;
            while ((line = fileReader.readLine()) != null) {
                results.add(seq.getSeqString() + " " + line);
            }

            fileReader.close();

            hdfs.delete(new Path(outputPath), true);
            hdfs.close();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /*
     * When all jobs finish, store their results in the output/result.csv file.
     */
    String finalOutputPath = "output/result.csv";
    try {
        FileSystem hdfs = FileSystem.get(new Configuration());
        Path file = new Path(finalOutputPath);
        if (hdfs.exists(file)) {
            hdfs.delete(file, true);
        }

        OutputStream os = hdfs.create(file);
        PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(os, "UTF-8"));

        // CSV file header
        printWriter.println("Actual,Predicted,MER,MAE");
        printWriter.flush();

        for (String result : results) {
            String[] tokens = result.split("\\s+");
            printWriter.println(tokens[0] + "," + tokens[1] + "," + tokens[2] + "," + tokens[3]);
            printWriter.flush();
        }

        printWriter.close();
        hdfs.close();
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(1);
    }
}
From source file:com.linkedin.mapred.AvroUtils.java
License:Open Source License
/**
 * Run an Avro hadoop job with a job conf.
 * @param conf
 * @throws Exception
 */
public static void runAvroJob(JobConf conf) throws Exception {
    Path[] inputPaths = AvroInputFormat.getInputPaths(conf);
    _log.info("Running hadoop job with input paths:");
    for (Path inputPath : inputPaths) {
        _log.info(inputPath);
    }
    _log.info("Output path=" + AvroOutputFormat.getOutputPath(conf));
    Job job = new Job(conf);
    job.setJarByClass(AvroUtils.class);
    job.waitForCompletion(true);
}
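Note that this example hands the constructor an org.apache.hadoop.mapred.JobConf rather than a plain Configuration; that compiles because JobConf extends Configuration, so the same Job(Configuration) constructor accepts it.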
From source file:com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java
License:Apache License
private TaskAttemptContext getContext(String nameOutput) throws IOException {
    TaskAttemptContext taskContext = taskContexts.get(nameOutput);
    if (taskContext != null) {
        return taskContext;
    }

    // The following trick leverages the instantiation of a record writer via
    // the job thus supporting arbitrary output formats.
    context.getConfiguration().set("avro.mo.config.namedOutput", nameOutput);
    Job job = new Job(context.getConfiguration());
    job.setOutputFormatClass(getNamedOutputFormatClass(context, nameOutput));

    Schema keySchema = keySchemas.get(nameOutput + "_KEYSCHEMA");
    Schema valSchema = valSchemas.get(nameOutput + "_VALSCHEMA");

    boolean isMaponly = job.getNumReduceTasks() == 0;

    if (keySchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputKeySchema(job, keySchema);
        else
            AvroJob.setOutputKeySchema(job, keySchema);
    }

    if (valSchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputValueSchema(job, valSchema);
        else
            AvroJob.setOutputValueSchema(job, valSchema);
    }

    taskContext = new TaskAttemptContext(job.getConfiguration(), context.getTaskAttemptID());
    taskContexts.put(nameOutput, taskContext);

    return taskContext;
}
From source file:com.marklogic.hadoop.csvtoxml.CsvToXML.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJobName("ProcessCSVtoXML_job");
    System.out.println("After the JobName Updates");

    job.setJarByClass(CsvToXML.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(ContentMapper.class);
    job.setMapOutputKeyClass(DocumentURI.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(ContentOutputFormat.class);
    System.out.println("Made it past external jar dependencies nodes");

    FileInputFormat.setInputPaths(job, new Path(otherArgs[1]));

    conf = job.getConfiguration();
    conf.addResource(otherArgs[0]);
    System.out.println("After the conf.set");

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.memonews.hbase.hadoop.CopyColumnFamilyData.java
License:Apache License
/**
 * Copies all rows from source-table/source-family to
 * target-table/target-family.
 *
 * @param args
 *            cli-parameter
 * @throws Exception
 *             when an error occurs
 */
public static void main(final String[] args) throws Exception {
    final Configuration conf = HBaseConfiguration.create();
    final String[] remainingArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remainingArgs.length != 4) {
        System.out.println(getUsage());
        System.exit(1);
        return;
    }
    String sourceTableName = remainingArgs[0];
    String sourceColumnFamily = remainingArgs[1];
    String destinationTableName = remainingArgs[2];
    String destinationColumnFamily = remainingArgs[3];

    Job job = new Job(conf);
    job.getConfiguration().set("sourceColumnFamily", sourceColumnFamily);
    job.getConfiguration().set("destinationColumnFamily", destinationColumnFamily);
    job.setJarByClass(CopyColumnFamilyData.class);
    Scan scan = new Scan();
    scan.addFamily(Bytes.toBytes(sourceColumnFamily));
    TableMapReduceUtil.setScannerCaching(job, 10000);
    TableMapReduceUtil.initTableMapperJob(sourceTableName, scan, IdentityTableMapper.class,
        ImmutableBytesWritable.class, Result.class, job);
    TableMapReduceUtil.initTableReducerJob(destinationTableName, ResultToPutIdentityReducer.class, job);

    // Determine the number of reduce tasks based on the number of splits in the source table
    // rather than the destination table, since the destination table will generally be empty.
    TableMapReduceUtil.setNumReduceTasks(sourceTableName, job);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}