List of usage examples for org.apache.hadoop.mapreduce.Job.setInputFormatClass
public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException
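A minimal sketch of the call in a job driver (the driver class name and argument-based paths are hypothetical placeholders, not taken from the examples below). setInputFormatClass must be called while the job is still being defined; once the job has been submitted it throws IllegalStateException.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class ExampleDriver {                      // hypothetical driver class
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "example");
        job.setJarByClass(ExampleDriver.class);
        job.setMapperClass(Mapper.class);         // identity mapper stands in for a real one
        job.setNumReduceTasks(0);

        // Choose how input splits and records are produced; must be set
        // before submission, otherwise IllegalStateException is thrown.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}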
From source file:com.lightboxtechnologies.spectrum.MRCoffeeJob.java
License:Apache License
public static int run(String imageID, String outpath, String[] command, Configuration conf)
        throws ClassNotFoundException, DecoderException, IOException, InterruptedException {
    conf.setStrings("command", command);
    conf.setLong("timestamp", System.currentTimeMillis());

    final Job job = new Job(conf, "MRCoffeeJob");
    job.setJarByClass(MRCoffeeJob.class);

    job.setMapperClass(MRCoffeeMapper.class);
    // job.setReducerClass(KeyValueSortReducer.class);
    // job.setNumReduceTasks(1);
    job.setNumReduceTasks(0);

    FsEntryHBaseInputFormat.setupJob(job, imageID);
    job.setInputFormatClass(FsEntryHBaseInputFormat.class);

    job.setOutputKeyClass(ImmutableHexWritable.class);
    // job.setOutputValueClass(KeyValue.class);
    job.setOutputValueClass(JsonWritable.class);
    // job.setOutputFormatClass(HFileOutputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    // HFileOutputFormat.setOutputPath(job, new Path(outpath));
    TextOutputFormat.setOutputPath(job, new Path(outpath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.lightboxtechnologies.spectrum.PythonJob.java
License:Apache License
public static int run(String imageID, String friendlyName, String outpath, String pymap, String pyred,
        String format, Configuration conf) throws Exception {
    if (conf == null) {
        conf = HBaseConfiguration.create();
    }

    final Job job = SKJobFactory.createJobFromConf(imageID, friendlyName, "PythonJob", conf);
    job.setJarByClass(PythonJob.class);
    job.setMapperClass(PythonMapper.class);

    PyEngine py = new PyEngine();
    configPyTask(job, py, "map", pymap);
    job.setMapOutputKeyClass(py.getKeyClass());
    job.setMapOutputValueClass(py.getValueClass());

    int numReduces = 1;
    job.setOutputKeyClass(py.getKeyClass());
    job.setOutputValueClass(py.getValueClass());

    if (pyred.equals("none")) {
        numReduces = 0;
    } else if (pyred.equals("identity")) {
        job.setReducerClass(Reducer.class);
        job.setOutputKeyClass(py.getKeyClass());
        job.setOutputValueClass(py.getValueClass());
    } else if (pyred.equals("LongSumReducer")) {
        job.setReducerClass(LongSumReducer.class);
        job.setCombinerClass(LongSumReducer.class);
    } else {
        job.setReducerClass(PythonReducer.class);
        configPyTask(job, py, "reduce", pyred);
        job.setOutputKeyClass(py.getKeyClass());
        job.setOutputValueClass(py.getValueClass());
    }
    job.setNumReduceTasks(numReduces);

    // it is possible to run over a flat json file...
    // String input = otherArgs[0];
    // if (input.endsWith(".json") == true) {
    //     job.setInputFormatClass(FsEntryJsonInputFormat.class);
    //     FsEntryJsonInputFormat.addInputPath(job, new Path(input));
    // }
    // else {
    FsEntryHBaseInputFormat.setupJob(job, imageID);
    job.setInputFormatClass(FsEntryHBaseInputFormat.class);

    if (format != null && format.equals("SequenceFileOutputFormat")) {
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
    } else {
        job.setOutputFormatClass(TextOutputFormat.class);
    }
    FileOutputFormat.setOutputPath(job, new Path(outpath));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:com.lightboxtechnologies.spectrum.SequenceFileExport.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    String imageID;
    String outpath;
    String friendlyname;
    final Set<String> exts = new HashSet<String>();

    if ("-f".equals(otherArgs[0])) {
        if (otherArgs.length != 5) {
            die();
        }

        // load extensions from file
        final Path extpath = new Path(otherArgs[1]);

        InputStream in = null;
        try {
            in = extpath.getFileSystem(conf).open(extpath);

            Reader r = null;
            try {
                r = new InputStreamReader(in);

                BufferedReader br = null;
                try {
                    br = new BufferedReader(r);

                    String line;
                    while ((line = br.readLine()) != null) {
                        exts.add(line.trim().toLowerCase());
                    }

                    br.close();
                } finally {
                    IOUtils.closeQuietly(br);
                }

                r.close();
            } finally {
                IOUtils.closeQuietly(r);
            }

            in.close();
        } finally {
            IOUtils.closeQuietly(in);
        }

        imageID = otherArgs[2];
        friendlyname = otherArgs[3];
        outpath = otherArgs[4];
    } else {
        if (otherArgs.length < 3) {
            die();
        }

        // read extensions from trailing args
        imageID = otherArgs[0];
        friendlyname = otherArgs[1];
        outpath = otherArgs[2];

        // lowercase all file extensions
        for (int i = 3; i < otherArgs.length; ++i) {
            exts.add(otherArgs[i].toLowerCase());
        }
    }

    conf.setStrings("extensions", exts.toArray(new String[exts.size()]));

    final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf);
    job.setJarByClass(SequenceFileExport.class);
    job.setMapperClass(SequenceFileExportMapper.class);
    job.setNumReduceTasks(0);

    job.setOutputKeyClass(BytesWritable.class);
    job.setOutputValueClass(MapWritable.class);

    job.setInputFormatClass(FsEntryHBaseInputFormat.class);
    FsEntryHBaseInputFormat.setupJob(job, imageID);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    FileOutputFormat.setOutputPath(job, new Path(outpath));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.linkedin.cubert.io.avro.AvroStorage.java
License:Open Source License
@Override
public void prepareInput(Job job, Configuration conf, JsonNode params, List<Path> paths) throws IOException {
    Schema avroSchema = AvroUtils.getSchema(conf, paths.get(0));

    // set the schema for this index
    conf.set("cubert.avro.input.schema", avroSchema.toString());

    if (params.has("unsplittable") && Boolean.parseBoolean(params.get("unsplittable").getTextValue()))
        conf.set("cubert.avro.input.unsplittable", "true");
    else
        conf.set("cubert.avro.input.unsplittable", "false");

    job.setInputFormatClass(PigAvroInputFormatAdaptor.class);
}
From source file:com.linkedin.cubert.io.rubix.RubixStorage.java
License:Open Source License
@Override
public void prepareInput(Job job, Configuration conf, JsonNode params, List<Path> paths) throws IOException {
    job.setInputFormatClass(RubixInputFormat.class);
}
From source file:com.linkedin.cubert.io.text.TextStorage.java
License:Open Source License
@Override
public void prepareInput(Job job, Configuration conf, JsonNode params, List<Path> paths) throws IOException {
    if (params.has("separator")) {
        conf.set(CubertStrings.TEXT_OUTPUT_SEPARATOR, JsonUtils.getText(params, "separator"));
    }

    job.setInputFormatClass(PigTextInputFormat.class);
}
From source file:com.linkedin.cubert.io.virtual.VirtualStorage.java
License:Open Source License
@Override
public void prepareInput(Job job, Configuration conf, JsonNode params, List<Path> paths) throws IOException {
    if (params.has("mappers")) {
        conf.set("mappers", JsonUtils.getText(params, "mappers"));
    }

    job.setInputFormatClass(VirtualInputFormat.class);
}
From source file:com.linkedin.hadoop.example.WordCountCounters.java
License:Apache License
/**
 * Azkaban will look for a method named `run` to start your job. Use this method to setup all the
 * Hadoop-related configuration for your job and submit it.
 *
 * @throws Exception If there is an exception during the configuration or submission of your job
 */
public void run() throws Exception {
    _logger.info(String.format("Configuring job for the class %s", getClass().getSimpleName()));

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordCountJob.class);
    job.setJobName(_name);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    String inputPath = _properties.getProperty("input.path");
    String outputPath = _properties.getProperty("output.path");
    boolean forceOverwrite = Boolean.parseBoolean(_properties.getProperty("force.output.overwrite", "false"));

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // Before we submit the job, remove the old output directory
    if (forceOverwrite) {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        fs.delete(FileOutputFormat.getOutputPath(job), true);
    }

    // Since we have Kerberos enabled at LinkedIn, we must add the token to our configuration. If
    // you don't use Kerberos security for your Hadoop cluster, you don't need this code.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Submit the job for execution
    _logger.info(String.format("About to submit the job named %s", _name));
    boolean succeeded = job.waitForCompletion(true);

    // Before we return, display our custom counters for the job in the Azkaban logs
    long inputWords = job.getCounters().findCounter(WordCountCounters.INPUT_WORDS).getValue();
    _logger.info(String.format("Read a total of %d input words", inputWords));

    // Azkaban will not realize the Hadoop job failed unless you specifically throw an exception
    if (!succeeded) {
        throw new Exception(String.format("Azkaban job %s failed", _name));
    }
}
From source file:com.linkedin.oneclick.wordcount.WordCount.java
License:Apache License
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    Job job = new Job(conf, "Word Count");
    job.setJarByClass(WordCount.class);

    String workDirectory = args.length >= 1 ? args[0] : "wordcount";
    Path input = new Path(workDirectory, "input.txt");
    FileSystem fs = input.getFileSystem(conf);
    fs.mkdirs(input.getParent());
    copy(resourceInputStream(getClass().getResource("/onegin.txt")), createOutputStream(conf, input), conf);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(WordCountMapper.class);
    FileInputFormat.addInputPath(job, input);

    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    Path output = clean(conf, new Path(workDirectory, "wordcount"));
    FileOutputFormat.setOutputPath(job, output);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    return job.waitForCompletion(true) ? 0 : -1;
}
From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License:Apache License
public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        stream.writeUTF(completeFilePath);
        stream.flush();
        stream.close();
    }

    Job job = Job.getInstance(getConf());
    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);

    job.setMapperClass(HadoopSegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}