List of usage examples for org.apache.hadoop.mapreduce.Job#getInputFormatClass()
@SuppressWarnings("unchecked") public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException
From source file:com.moz.fiji.mapreduce.TestFijiMapReduceJobBuilder.java
License:Apache License
@Test
public void testBuildWithXmlKVStores() throws Exception {
    // Test that we can override default configuration KeyValueStores from an XML file.
    final InputStream xmlStores = Resources.openSystemResource("com.moz.fiji/mapreduce/test-kvstores.xml");

    // This file needs to exist before we build the job, or else
    // we can't build the job; it's referenced by a key-value store that checks
    // for its presence.
    final File tmpFile = new File("/tmp/foo.seq");
    if (tmpFile.createNewFile()) {
        // We created this temp file, we're responsible for deleting it.
        tmpFile.deleteOnExit();
    }

    LOG.info("Building job...");
    final FijiMapReduceJob job = FijiMapReduceJobBuilder.create().withConf(mConf)
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path("/path/to/my/input")))
        .withMapper(MyMapper.class).withReducer(MyReducer.class)
        .withOutput(MapReduceJobOutputs.newTextMapReduceJobOutput(new Path("/path/to/my/output"), 16))
        .withStoreBindings(xmlStores).build();

    xmlStores.close();

    LOG.info("Verifying job configuration...");
    final Job hadoopJob = job.getHadoopJob();
    assertEquals(TextInputFormat.class, hadoopJob.getInputFormatClass());
    assertEquals(MyMapper.class, hadoopJob.getMapperClass());
    assertEquals(MyReducer.class, hadoopJob.getReducerClass());
    assertEquals(16, hadoopJob.getNumReduceTasks());
    assertEquals(TextOutputFormat.class, hadoopJob.getOutputFormatClass());

    // KeyValueStore-specific checks here.
    // We override mapperMap with a SeqFileKeyValueStore.
    Configuration confOut = hadoopJob.getConfiguration();
    assertEquals(2, confOut.getInt(KeyValueStoreConfigSerializer.CONF_KEY_VALUE_STORE_COUNT, 0));
    assertEquals(SeqFileKeyValueStore.class.getName(),
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
            + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("mapperMap",
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
            + KeyValueStoreConfigSerializer.CONF_NAME));
    assertEquals(EmptyKeyValueStore.class.getName(),
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "1."
            + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("reducerMap",
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "1."
            + KeyValueStoreConfigSerializer.CONF_NAME));
}
From source file:com.moz.fiji.mapreduce.TestFijiProduceJobBuilder.java
License:Apache License
@Test
public void testBuildWithHFileOutput() throws ClassNotFoundException, IOException {
    final FijiMapReduceJob produceJob = FijiProduceJobBuilder.create().withConf(getConf())
        .withInputTable(mTable.getURI()).withProducer(MyProducer.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(mTable.getURI(), new Path("foo/bar"), 10))
        .build();

    // Verify that the MR Job was configured correctly.
    final Job job = produceJob.getHadoopJob();
    assertEquals(HBaseFijiTableInputFormat.class, job.getInputFormatClass());
    assertEquals(ProduceMapper.class, job.getMapperClass());
    assertEquals(MyProducer.class, job.getConfiguration().getClass(FijiConfKeys.FIJI_PRODUCER_CLASS, null));
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(FijiHFileOutputFormat.class, job.getOutputFormatClass());
}
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    HashMap<K, V> samples = (HashMap<K, V>) sampler.getSample(inf, job);
    LOG.info("Using " + samples.size() + " samples");

    // write the input samples in to file <partitionfile>/mapIn
    Path dstOut = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    Path dst = new Path(dstOut, "mapIn");
    FileSystem fs = dst.getFileSystem(conf);
    SequenceFile.Writer sampleWriter = null;
    for (Map.Entry<K, V> sample : samples.entrySet()) {
        sampleWriter = SequenceFile.createWriter(fs, conf, dst,
            sample.getKey().getClass(), sample.getValue().getClass());
        break;
    }
    for (Map.Entry<K, V> sample : samples.entrySet()) {
        sampleWriter.append(sample.getKey(), sample.getValue());
    }
    sampleWriter.close();

    LOG.info("Sample Input File location " + dst.toString());
    // run map reduce on the samples input
    runMap(job, dst);
}
From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java
License:Apache License
/**
 * Add the dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(),
            org.apache.zookeeper.ZooKeeper.class,
            com.google.protobuf.Message.class,
            com.google.common.collect.ImmutableSet.class,
            job.getMapOutputKeyClass(),
            job.getMapOutputValueClass(),
            job.getInputFormatClass(),
            job.getOutputKeyClass(),
            job.getOutputValueClass(),
            job.getOutputFormatClass(),
            job.getPartitionerClass(),
            job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
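A hedged sketch of how a utility like this is typically driven: configure the job's classes first, then call addDependencyJars, since it resolves the jars to ship from job.getInputFormatClass(), job.getOutputFormatClass(), and the other configured classes. The formats and key/value types below are placeholders for the sketch, not Splice Machine's actual choices.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import com.splicemachine.mrio.api.SpliceTableMapReduceUtil;

public class DependencyJarsSketch {
    public static Job newJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "splice-example");
        // Configure the classes first; addDependencyJars() inspects the job via
        // getInputFormatClass(), getOutputFormatClass(), etc., so the jars it
        // ships reflect whatever is set here.
        job.setInputFormatClass(TextInputFormat.class);   // placeholder format for the sketch
        job.setOutputFormatClass(NullOutputFormat.class); // placeholder format for the sketch
        job.setMapperClass(Mapper.class);                  // identity mapper as a stand-in
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        SpliceTableMapReduceUtil.addDependencyJars(job);
        return job;
    }
}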
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    // empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:edu.uci.ics.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                    job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License:Open Source License
/**
 * Add the HBase dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(),
            org.apache.zookeeper.ZooKeeper.class,
            com.google.protobuf.Message.class,
            job.getMapOutputKeyClass(),
            job.getMapOutputValueClass(),
            job.getInputFormatClass(),
            job.getOutputKeyClass(),
            job.getOutputValueClass(),
            job.getOutputFormatClass(),
            job.getPartitionerClass(),
            job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
        job.getMapOutputKeyClass(), NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
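The partition file written by this method is meant to be read back by a TotalOrderPartitioner. A sketch of the driver-side wiring follows, shown with Hadoop's stock InputSampler and TotalOrderPartitioner (which this copy mirrors); the input format, key type, reduce count, path, and sampler parameters are all illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderDriverSketch {
    public static void configure(Job job) throws Exception {
        // The InputFormat must be set before sampling, because
        // writePartitionFile instantiates it via job.getInputFormatClass().
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setNumReduceTasks(8);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Point the partitioner at the partition file, then populate it from a
        // random sample of the job's input.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
            new Path("/tmp/partitions.lst"));
        InputSampler.writePartitionFile(job,
            new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));
    }
}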
From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java
License:Apache License
private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        job.setMapperClass(PipesMapper.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, job.getPartitionerClass());
        job.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        job.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            job.setOutputFormatClass(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }

    if (avroInput != null) {
        if (explicitInputFormat) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        } // else let the bridge fall back to the appropriate Avro IF
        switch (avroInput) {
        case K:
            job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
            break;
        case V:
            job.setInputFormatClass(PydoopAvroInputValueBridge.class);
            break;
        case KV:
            job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro input type");
        }
    }

    if (avroOutput != null) {
        if (explicitOutputFormat) {
            conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
        } // else let the bridge fall back to the appropriate Avro OF
        conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
        switch (avroOutput) {
        case K:
            job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
            break;
        case V:
            job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
            break;
        case KV:
            job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro output type");
        }
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        String msg = "No application program defined.";
        throw new IllegalArgumentException(msg);
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    // FIXME: this is kind of useless if the pipes program is not in c++
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        String msg = "Problem parsing executable URI " + exec;
        IOException ie = new IOException(msg);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
From source file:it.crs4.seal.recab.RecabTable.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    RecabTableOptionParser parser = new RecabTableOptionParser();
    parser.parse(getConf(), args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeVariantsFile(parser);

    // Create a Job using the processed conf
    Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));
    job.setJarByClass(RecabTable.class);
    job.setInputFormatClass(FormatNameMap.getInputFormat(
        job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
    LOG.info("Using input format " + job.getInputFormatClass().getName());

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObservationCount.class);

    job.setCombinerClass(Combiner.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}