List of usage examples for org.apache.hadoop.mapreduce.Job#getInputFormatClass()
@SuppressWarnings("unchecked") public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException
From source file:mapReduceBasics.WordCount.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser GOP = new GenericOptionsParser(conf, args);
    Configuration newConf = GOP.getConfiguration();
    String[] otherArgs = GOP.getRemainingArgs();
    System.err.println("Best of Luck");
    Job job = new Job(newConf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    Path inputPath = new Path(otherArgs[0]);
    Path outputPath = new Path(otherArgs[1]);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outputPath, true);
    // job.setNumReduceTasks(3);
    System.err.println("Input Format - " + job.getInputFormatClass());
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.apache.blur.mapreduce.lib.BlurMapReduceUtil.java
License:Apache License
/**
 * Add the Blur dependency jars as well as jars for any of the configured job
 * classes to the job configuration, so that JobClient will ship them to the
 * cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass(), DocumentVisibility.class);
        addAllJarsInBlurLib(job.getConfiguration());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:org.apache.jena.tdbloader4.partitioners.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    log.debug("writePartitionFile({},{})", job, sampler);
    Configuration conf = job.getConfiguration();
    @SuppressWarnings("rawtypes")
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks() / 9;
    log.debug("Number of partitions is {} for each index", numPartitions);
    K[] samples = sampler.getSample(inf, job);
    log.info("Using " + samples.length + " samples");
    writePartitionFile(samples, "GSPO", job, conf, numPartitions);
    writePartitionFile(samples, "GPOS", job, conf, numPartitions);
    writePartitionFile(samples, "GOSP", job, conf, numPartitions);
    writePartitionFile(samples, "SPOG", job, conf, numPartitions);
    writePartitionFile(samples, "POSG", job, conf, numPartitions);
    writePartitionFile(samples, "OSPG", job, conf, numPartitions);
    writePartitionFile(samples, "SPO", job, conf, numPartitions);
    writePartitionFile(samples, "POS", job, conf, numPartitions);
    writePartitionFile(samples, "OSP", job, conf, numPartitions);
}
From source file:org.apache.kudu.mapreduce.KuduTableMapReduceUtil.java
License:Apache License
/**
 * Add the Kudu dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    addKuduDependencyJars(job.getConfiguration());
    try {
        addDependencyJars(job.getConfiguration(),
                // when making changes here, consider also mapred.TableMapReduceUtil
                // pull job classes
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
From source file:org.broadinstitute.sting.gatk.hadoop.hadoopsrc.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file:org.cloudgraph.hbase.mapreduce.GraphMapReduceSetup.java
License:Apache License
/**
 * Add the HBase dependency jars as well as jars for any of the configured job
 * classes to the job configuration, so that JobClient will ship them to the
 * cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                com.google.protobuf.Message.class, com.google.common.collect.ImmutableSet.class,
                org.apache.hadoop.hbase.util.Bytes.class, // one class from hbase.jar
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.kiji.mapreduce.TestKijiBulkImportJobBuilder.java
License:Apache License
@Test
public void testBuildWithHFileOutput() throws Exception {
    final MapReduceJob mrjob = KijiBulkImportJobBuilder.create().withConf(getConf())
            .withInput(new TextMapReduceJobInput(new Path(mTempPath, "input")))
            .withBulkImporter(NoopBulkImporter.class)
            .withOutput(new HFileMapReduceJobOutput(mTable, new Path(mTempPath, "output"), 10)).build();

    final Job job = mrjob.getHadoopJob();
    assertEquals(TextInputFormat.class, job.getInputFormatClass());
    assertEquals(BulkImportMapper.class, job.getMapperClass());
    assertEquals(NoopBulkImporter.class,
            job.getConfiguration().getClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, null));
    assertEquals(IdentityReducer.class, job.getReducerClass());
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(KijiHFileOutputFormat.class, job.getOutputFormatClass());
    assertEquals(TotalOrderPartitioner.class, job.getPartitionerClass());
}
From source file:org.kiji.mapreduce.TestKijiBulkImportJobBuilder.java
License:Apache License
@Test
public void testBuildWithKeyValueStore() throws Exception {
    final MapReduceJob mrjob = KijiBulkImportJobBuilder.create().withConf(getConf())
            .withInput(new TextMapReduceJobInput(new Path(mTempPath, "input")))
            .withBulkImporter(KVStoreBulkImporter.class)
            .withOutput(new HFileMapReduceJobOutput(mTable, new Path(mTempPath, "output"), 10)).build();

    final Job job = mrjob.getHadoopJob();
    // Verify that everything else is what we expected as in the previous test
    // (except the bulk importer class name)...
    assertEquals(TextInputFormat.class, job.getInputFormatClass());
    assertEquals(BulkImportMapper.class, job.getMapperClass());
    assertEquals(KVStoreBulkImporter.class,
            job.getConfiguration().getClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, null));
    assertEquals(IdentityReducer.class, job.getReducerClass());
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(KijiHFileOutputFormat.class, job.getOutputFormatClass());
    assertEquals(TotalOrderPartitioner.class, job.getPartitionerClass());

    // KeyValueStore-specific checks here.
    final Configuration confOut = job.getConfiguration();
    assertEquals(1, confOut.getInt(KeyValueStoreConfigSerializer.CONF_KEY_VALUE_STORE_COUNT, 0));
    assertEquals(EmptyKeyValueStore.class.getName(),
            confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
                    + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("foostore",
            confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
                    + KeyValueStoreConfigSerializer.CONF_NAME));
}