List of usage examples for org.apache.hadoop.mapreduce.Job#setOutputValueClass
public void setOutputValueClass(Class<?> theClass) throws IllegalStateException
From source file:com.lightboxtechnologies.nsrl.HashLoader.java
License:Apache License
public static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 6) { System.err//from w w w .j a v a 2s . co m .println("Usage: HashLoader <mfgfile> <osfile> <prodfile> <hashfile> <outpath> <num_reducers>"); System.exit(2); } final String mfg_filename = otherArgs[0]; final String os_filename = otherArgs[1]; final String prod_filename = otherArgs[2]; final String hash_filename = otherArgs[3]; final String output_filename = otherArgs[4]; conf.set("mfg_filename", mfg_filename); conf.set("os_filename", os_filename); conf.set("prod_filename", prod_filename); conf.setLong("timestamp", System.currentTimeMillis()); SKJobFactory.addDependencies(conf); final Job job = new Job(conf, "HashLoader"); job.setJarByClass(HashLoader.class); job.setMapperClass(HashLoaderMapper.class); job.setReducerClass(KeyValueSortReducer.class); job.setNumReduceTasks(Integer.parseInt(otherArgs[5])); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(KeyValue.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(HFileOutputFormat.class); TextInputFormat.addInputPath(job, new Path(hash_filename)); HFileOutputFormat.setOutputPath(job, new Path(output_filename)); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:com.lightboxtechnologies.spectrum.BlockHasher.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 3) { System.err.println("Usage: BlockHasher <imageID> <image> <output>"); return 2; }/*from ww w . ja va 2s . c o m*/ final String imageID = args[0]; final String image = args[1]; final String output = args[2]; Configuration conf = getConf(); final Job job = SKJobFactory.createJobFromConf(imageID, image, "BlockHasher", conf); job.setJarByClass(BlockHasher.class); job.setMapperClass(BlockHashMapper.class); // job.setReducerClass(Reducer.class); job.setNumReduceTasks(0); // job ctor copies the Configuration we pass it, get the real one conf = job.getConfiguration(); conf.setLong("timestamp", System.currentTimeMillis()); job.setInputFormatClass(RawFileInputFormat.class); RawFileInputFormat.addInputPath(job, new Path(image)); job.setOutputFormatClass(TextOutputFormat.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(MD5Hash.class); FileOutputFormat.setOutputPath(job, new Path(output)); conf.setInt("mapred.job.reuse.jvm.num.tasks", -1); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.lightboxtechnologies.spectrum.ExtentsExtractor.java
License:Apache License
public static int run(String imageID, String friendlyName, String outDir) throws Exception { Job job = SKJobFactory.createJob(imageID, friendlyName, "ExtentsExtractor"); job.setJarByClass(ExtentsExtractor.class); job.setMapperClass(ExtentsExtractorMapper.class); job.setNumReduceTasks(1);//from w ww.j a v a 2 s . co m job.setReducerClass(Reducer.class); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(JsonWritable.class); job.setInputFormatClass(FsEntryHBaseInputFormat.class); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputPath(job, new Path(outDir)); FsEntryHBaseInputFormat.setupJob(job, imageID); System.out.println("Spinning off ExtentsExtractor Job..."); job.waitForCompletion(true); return 0; }
From source file:com.lightboxtechnologies.spectrum.ExtractData.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 4) { System.err.println("Usage: ExtractData <imageID> <friendly_name> <extents_file> <evidence file>"); return 2; }/*from ww w . j a v a 2s . c o m*/ final String imageID = args[0]; final String friendlyName = args[1]; final String extentsPath = args[2]; final String image = args[3]; Configuration conf = getConf(); final Job job = SKJobFactory.createJobFromConf(imageID, friendlyName, "ExtractData", conf); job.setJarByClass(ExtractData.class); job.setMapperClass(ExtractDataMapper.class); job.setReducerClass(KeyValueSortReducer.class); job.setNumReduceTasks(1); // job ctor copies the Configuration we pass it, get the real one conf = job.getConfiguration(); conf.setLong("timestamp", System.currentTimeMillis()); job.setInputFormatClass(RawFileInputFormat.class); RawFileInputFormat.addInputPath(job, new Path(image)); job.setOutputFormatClass(HFileOutputFormat.class); job.setOutputKeyClass(ImmutableBytesWritable.class); job.setOutputValueClass(KeyValue.class); conf.setInt("mapreduce.job.jvm.numtasks", -1); final FileSystem fs = FileSystem.get(conf); Path hfileDir = new Path("/texaspete/ev/tmp", UUID.randomUUID().toString()); hfileDir = hfileDir.makeQualified(fs); LOG.info("Hashes will be written temporarily to " + hfileDir); HFileOutputFormat.setOutputPath(job, hfileDir); final Path extp = new Path(extentsPath); final URI extents = extp.toUri(); LOG.info("extents file is " + extents); DistributedCache.addCacheFile(extents, conf); conf.set("com.lbt.extentsname", extp.getName()); // job.getConfiguration().setBoolean("mapred.task.profile", true); // job.getConfiguration().setBoolean("mapreduce.task.profile", true); HBaseTables.summon(conf, HBaseTables.HASH_TBL_B, HBaseTables.HASH_COLFAM_B); HBaseTables.summon(conf, HBaseTables.ENTRIES_TBL_B, HBaseTables.ENTRIES_COLFAM_B); final boolean result = job.waitForCompletion(true); if (result) { LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf); 
HBaseConfiguration.addHbaseResources(conf); loader.setConf(conf); LOG.info("Loading hashes into hbase"); chmodR(fs, hfileDir); loader.doBulkLoad(hfileDir, new HTable(conf, HBaseTables.HASH_TBL_B)); // result = fs.delete(hfileDir, true); } return result ? 0 : 1; }
From source file:com.lightboxtechnologies.spectrum.FolderCount.java
License:Apache License
/**
 * Command-line entry point for the FolderCount job.
 *
 * <p>After generic Hadoop options are stripped, exactly two arguments are expected:
 * {@code <table> <outpath>}. The job scans the entries column family of the given table and
 * writes text output, using {@code IntSumReducer} as both combiner and reducer. The process
 * exits with 2 on bad usage, 0 when the job completes successfully, and 1 otherwise.
 *
 * @param args raw command-line arguments
 * @throws Exception if option parsing, job setup, or job execution fails
 */
public static void main(String[] args) throws Exception {
  final Configuration conf = new Configuration();
  final String[] remaining = new GenericOptionsParser(conf, args).getRemainingArgs();

  if (remaining.length != 2) {
    System.err.println("Usage: FolderCount <table> <outpath>");
    System.exit(2);
  }

  final Job job = new Job(conf, "FolderCount");
  job.setJarByClass(FolderCount.class);
  job.setMapperClass(FolderCountMapper.class);
  job.setCombinerClass(IntSumReducer.class);
  job.setReducerClass(IntSumReducer.class);
  job.setNumReduceTasks(1);

  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);

  job.setInputFormatClass(FsEntryHBaseInputFormat.class);
  job.setOutputFormatClass(TextOutputFormat.class);

  // Restrict the table scan to the entries column family.
  final Scan entriesScan = new Scan();
  entriesScan.addFamily(HBaseTables.ENTRIES_COLFAM_B);

  final String tableName = remaining[0];
  job.getConfiguration().set(TableInputFormat.INPUT_TABLE, tableName);
  job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(entriesScan));

  FileOutputFormat.setOutputPath(job, new Path(remaining[1]));

  final int exitCode = job.waitForCompletion(true) ? 0 : 1;
  System.exit(exitCode);
}
From source file:com.lightboxtechnologies.spectrum.MRCoffeeJob.java
License:Apache License
public static int run(String imageID, String outpath, String[] command, Configuration conf) throws ClassNotFoundException, DecoderException, IOException, InterruptedException { conf.setStrings("command", command); conf.setLong("timestamp", System.currentTimeMillis()); final Job job = new Job(conf, "MRCoffeeJob"); job.setJarByClass(MRCoffeeJob.class); job.setMapperClass(MRCoffeeMapper.class); // job.setReducerClass(KeyValueSortReducer.class); // job.setNumReduceTasks(1); job.setNumReduceTasks(0);//from ww w . ja v a 2s . co m FsEntryHBaseInputFormat.setupJob(job, imageID); job.setInputFormatClass(FsEntryHBaseInputFormat.class); job.setOutputKeyClass(ImmutableHexWritable.class); // job.setOutputValueClass(KeyValue.class); job.setOutputValueClass(JsonWritable.class); // job.setOutputFormatClass(HFileOutputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); // HFileOutputFormat.setOutputPath(job, new Path(outpath)); TextOutputFormat.setOutputPath(job, new Path(outpath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.lightboxtechnologies.spectrum.PythonJob.java
License:Apache License
public static int run(String imageID, String friendlyName, String outpath, String pymap, String pyred, String format, Configuration conf) throws Exception { if (conf == null) { conf = HBaseConfiguration.create(); }// www . j a va 2s . c om final Job job = SKJobFactory.createJobFromConf(imageID, friendlyName, "PythonJob", conf); job.setJarByClass(PythonJob.class); job.setMapperClass(PythonMapper.class); PyEngine py = new PyEngine(); configPyTask(job, py, "map", pymap); job.setMapOutputKeyClass(py.getKeyClass()); job.setMapOutputValueClass(py.getValueClass()); int numReduces = 1; job.setOutputKeyClass(py.getKeyClass()); job.setOutputValueClass(py.getValueClass()); if (pyred.equals("none")) { numReduces = 0; } else if (pyred.equals("identity")) { job.setReducerClass(Reducer.class); job.setOutputKeyClass(py.getKeyClass()); job.setOutputValueClass(py.getValueClass()); } else if (pyred.equals("LongSumReducer")) { job.setReducerClass(LongSumReducer.class); job.setCombinerClass(LongSumReducer.class); } else { job.setReducerClass(PythonReducer.class); configPyTask(job, py, "reduce", pyred); job.setOutputKeyClass(py.getKeyClass()); job.setOutputValueClass(py.getValueClass()); } job.setNumReduceTasks(numReduces); // it is possible to run over a flat json file... // String input = otherArgs[0]; // if (input.endsWith(".json") == true) { // job.setInputFormatClass(FsEntryJsonInputFormat.class); // FsEntryJsonInputFormat.addInputPath(job, new Path(input)); // } // else { FsEntryHBaseInputFormat.setupJob(job, imageID); job.setInputFormatClass(FsEntryHBaseInputFormat.class); if (format != null && format.equals("SequenceFileOutputFormat")) { job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); } else { job.setOutputFormatClass(TextOutputFormat.class); } FileOutputFormat.setOutputPath(job, new Path(outpath)); return job.waitForCompletion(true) ? 0 : 1; }
From source file:com.lightboxtechnologies.spectrum.SequenceFileExport.java
License:Apache License
public static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); String imageID;//from w ww.j a v a 2 s .c o m String outpath; String friendlyname; final Set<String> exts = new HashSet<String>(); if ("-f".equals(otherArgs[0])) { if (otherArgs.length != 4) { die(); } // load extensions from file final Path extpath = new Path(otherArgs[1]); InputStream in = null; try { in = extpath.getFileSystem(conf).open(extpath); Reader r = null; try { r = new InputStreamReader(in); BufferedReader br = null; try { br = new BufferedReader(r); String line; while ((line = br.readLine()) != null) { exts.add(line.trim().toLowerCase()); } br.close(); } finally { IOUtils.closeQuietly(br); } r.close(); } finally { IOUtils.closeQuietly(r); } in.close(); } finally { IOUtils.closeQuietly(in); } imageID = otherArgs[2]; friendlyname = otherArgs[3]; outpath = otherArgs[4]; } else { if (otherArgs.length < 3) { die(); } // read extensions from trailing args imageID = otherArgs[0]; friendlyname = otherArgs[1]; outpath = otherArgs[2]; // lowercase all file extensions for (int i = 2; i < otherArgs.length; ++i) { exts.add(otherArgs[i].toLowerCase()); } } conf.setStrings("extensions", exts.toArray(new String[exts.size()])); final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf); job.setJarByClass(SequenceFileExport.class); job.setMapperClass(SequenceFileExportMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(MapWritable.class); job.setInputFormatClass(FsEntryHBaseInputFormat.class); FsEntryHBaseInputFormat.setupJob(job, imageID); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); FileOutputFormat.setOutputPath(job, new Path(outpath)); System.exit(job.waitForCompletion(true) ? 
0 : 1); }
From source file:com.linkedin.cubert.io.rubix.RubixStorage.java
License:Open Source License
/**
 * Configures {@code job} to write Rubix output with tuple keys and values.
 *
 * <p>The key and value class are both the runtime class of a tuple created by the tuple
 * factory. When {@code params} has a {@code "compact"} entry, its text is parsed as a boolean
 * and stored in {@code conf} under {@code CubertStrings.USE_COMPACT_SERIALIZATION}.
 *
 * @param job job whose output classes and output format are set
 * @param conf configuration that receives the compact-serialization flag
 * @param params storage parameters; only {@code "compact"} is read here
 * @param schema not used by this method
 * @param path not used by this method
 */
@Override
public void prepareOutput(Job job, Configuration conf, JsonNode params, BlockSchema schema, Path path) {
  final Class<?> tupleType = TupleFactory.getInstance().newTuple().getClass();

  job.setOutputKeyClass(tupleType);
  job.setOutputValueClass(tupleType);
  job.setOutputFormatClass(RubixOutputFormat.class);

  if (!params.has("compact")) {
    return;
  }

  final boolean compact = Boolean.parseBoolean(JsonUtils.getText(params, "compact"));
  conf.setBoolean(CubertStrings.USE_COMPACT_SERIALIZATION, compact);
}
From source file:com.linkedin.hadoop.example.WordCountCounters.java
License:Apache License
/** * Azkaban will look for a method named `run` to start your job. Use this method to setup all the * Hadoop-related configuration for your job and submit it. * * @throws Exception If there is an exception during the configuration or submission of your job *///from w ww. j a v a 2 s . com public void run() throws Exception { _logger.info(String.format("Configuring job for the class %s", getClass().getSimpleName())); Job job = Job.getInstance(getConf()); job.setJarByClass(WordCountJob.class); job.setJobName(_name); job.setMapperClass(WordCountMapper.class); job.setCombinerClass(WordCountCombiner.class); job.setReducerClass(WordCountReducer.class); job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); String inputPath = _properties.getProperty("input.path"); String outputPath = _properties.getProperty("output.path"); boolean forceOverwrite = Boolean.parseBoolean(_properties.getProperty("force.output.overwrite", "false")); FileInputFormat.addInputPath(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, new Path(outputPath)); // Before we submit the job, remove the old the output directory if (forceOverwrite) { FileSystem fs = FileSystem.get(job.getConfiguration()); fs.delete(FileOutputFormat.getOutputPath(job), true); } // Since we have Kerberos enabled at LinkedIn, we must add the token to our configuration. If // you don't use Kerberos security for your Hadoop cluster, you don't need this code. 
if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } // Submit the job for execution _logger.info(String.format("About to submit the job named %s", _name)); boolean succeeded = job.waitForCompletion(true); // Before we return, display our custom counters for the job in the Azkaban logs long inputWords = job.getCounters().findCounter(WordCountCounters.INPUT_WORDS).getValue(); _logger.info(String.format("Read a total of %d input words", inputWords)); // Azkaban will not realize the Hadoop job failed unless you specifically throw an exception if (!succeeded) { throw new Exception(String.format("Azkaban job %s failed", _name)); } }