List of usage examples for org.apache.hadoop.mapreduce.Job#getConfiguration()
public Configuration getConfiguration()
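Before the per-file examples, a minimal standalone sketch of the typical pattern: obtain the job's Configuration and read or write properties on it before submission. The property keys are standard Hadoop settings; the job name and values are illustrative.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;

public class GetConfigurationExample {
    public static void main(String[] args) throws IOException {
        Job job = Job.getInstance();
        job.setJobName("getConfiguration-example");      // illustrative name

        // getConfiguration() returns the live Configuration backing the job,
        // so changes made here are visible when the job is submitted.
        Configuration conf = job.getConfiguration();
        conf.set("mapreduce.job.reduces", "2");              // standard Hadoop property
        conf.setBoolean("mapreduce.map.speculative", false); // standard Hadoop property

        System.out.println(conf.get("mapreduce.job.reduces"));
    }
}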
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new TaggedInputSplit(inputName, split, confCopy, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }/*from www . j a v a 2 s . c o m*/ } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); //some input format need a jobId to getSplits jobCopy.setJobID(new JobID(inputName, inputName.hashCode())); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a MultiInputTaggedSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }//from w w w . j a va 2 s. c om } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 *
 * @param job The {@link Job}
 * @param namedInput name of the input
 * @param inputFormatClass the name of the InputFormat class to be used for this input
 * @param inputConfigs the configurations to be used for this input
 * @param mapperClass {@link Mapper} class to use for this path
 */
@SuppressWarnings("unchecked")
public static void addInput(Job job, String namedInput, String inputFormatClass,
                            Map<String, String> inputConfigs, Class<? extends Mapper> mapperClass) {
    Configuration conf = job.getConfiguration();
    Map<String, MapperInput> map = getInputMap(conf);

    // this shouldn't happen, because it is already protected against in BasicMapReduceContext#addInput
    if (map.containsKey(namedInput)) {
        throw new IllegalArgumentException("Input already configured: " + namedInput);
    }
    map.put(namedInput, new MapperInput(inputFormatClass, inputConfigs, mapperClass));
    conf.set(INPUT_CONFIGS, GSON.toJson(map));
    job.setInputFormatClass(DelegatingInputFormat.class);
}
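A hypothetical caller of the addInput method above might look like the following sketch; the input name, configuration entries, and EventMapper are placeholders, while TextInputFormat is the stock Hadoop input format.

import co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputs;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultipleInputsUsage {
    // Placeholder mapper for illustration only.
    public static class EventMapper extends Mapper<LongWritable, Text, Text, Text> { }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        MultipleInputs.addInput(job, "events", TextInputFormat.class.getName(),
                ImmutableMap.of("key1", "val1"), EventMapper.class);
        // The entry is serialized into job.getConfiguration() under INPUT_CONFIGS,
        // and the job's InputFormat is switched to DelegatingInputFormat.
    }
}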
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultipleInputsTest.java
License:Apache License
@Test
public void testConfigurations() throws IOException, ClassNotFoundException {
    Job job = Job.getInstance();

    String inputName1 = "inputName1";
    String inputFormatClass1 = TextInputFormat.class.getName();
    Map<String, String> inputFormatConfigs1 = ImmutableMap.of("key1", "val1", "key2", "val2");
    MultipleInputs.addInput(job, inputName1, inputFormatClass1, inputFormatConfigs1, job.getMapperClass());

    Map<String, MultipleInputs.MapperInput> map = MultipleInputs.getInputMap(job.getConfiguration());
    Assert.assertEquals(1, map.size());
    Assert.assertEquals(inputName1, Iterables.getOnlyElement(map.keySet()));
    Assert.assertEquals(inputFormatClass1, Iterables.getOnlyElement(map.values()).getInputFormatClassName());
    Assert.assertEquals(inputFormatConfigs1, Iterables.getOnlyElement(map.values()).getInputFormatConfiguration());
    Assert.assertEquals(job.getMapperClass().getName(),
                        Iterables.getOnlyElement(map.values()).getMapperClassName());
    Assert.assertEquals(DelegatingInputFormat.class, job.getInputFormatClass());

    // now, test with two inputs in the configuration
    String inputName2 = "inputName2";
    String inputFormatClass2 = TextInputFormat.class.getName();
    Map<String, String> inputFormatConfigs2 = ImmutableMap.of("some_key1", "some_val1", "some_key2", "some_val2");
    MultipleInputs.addInput(job, inputName2, inputFormatClass2, inputFormatConfigs2, CustomMapper.class);

    map = MultipleInputs.getInputMap(job.getConfiguration());
    Assert.assertEquals(2, map.size());

    MultipleInputs.MapperInput mapperInput1 = map.get(inputName1);
    Assert.assertEquals(inputFormatClass1, mapperInput1.getInputFormatClassName());
    Assert.assertEquals(inputFormatConfigs1, mapperInput1.getInputFormatConfiguration());
    Assert.assertEquals(job.getMapperClass().getName(), mapperInput1.getMapperClassName());

    MultipleInputs.MapperInput mapperInput2 = map.get(inputName2);
    Assert.assertEquals(inputFormatClass2, mapperInput2.getInputFormatClassName());
    Assert.assertEquals(inputFormatConfigs2, mapperInput2.getInputFormatConfiguration());
    Assert.assertEquals(CustomMapper.class,
                        job.getConfiguration().getClassByName(mapperInput2.getMapperClassName()));
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
/**
 * Adds a named output for the job.
 *
 * @param job job to add the named output
 * @param namedOutput named output name, it has to be a word, letters
 *                    and numbers only (alphanumeric)
 * @param outputFormatClass name of the OutputFormat class.
 * @param keyClass key class
 * @param valueClass value class
 * @param outputConfigs configurations for the output
 */
@SuppressWarnings("unchecked")
public static void addNamedOutput(Job job, String namedOutput, String outputFormatClass,
                                  Class<?> keyClass, Class<?> valueClass, Map<String, String> outputConfigs) {
    assertValidName(namedOutput);
    checkNamedOutputName(namedOutput, getNamedOutputsList(job), false);
    Configuration conf = job.getConfiguration();
    conf.set(MULTIPLE_OUTPUTS, conf.get(MULTIPLE_OUTPUTS, "") + " " + namedOutput);
    conf.set(MO_PREFIX + namedOutput + FORMAT, outputFormatClass);
    conf.setClass(MO_PREFIX + namedOutput + KEY, keyClass, Object.class);
    conf.setClass(MO_PREFIX + namedOutput + VALUE, valueClass, Object.class);
    ConfigurationUtil.setNamedConfigurations(conf, computePrefixName(namedOutput), outputConfigs);
}
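A sketch of calling addNamedOutput, assuming the stock Hadoop TextOutputFormat and Text/LongWritable key and value classes; the output name and the separator setting are illustrative.

import co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class MultipleOutputsUsage {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        // "summary" must be alphanumeric, per the javadoc above.
        MultipleOutputs.addNamedOutput(job, "summary", TextOutputFormat.class.getName(),
                Text.class, LongWritable.class,
                ImmutableMap.of("mapreduce.output.textoutputformat.separator", ","));
        // The format, key/value classes and per-output configuration all end up
        // under prefixed keys in job.getConfiguration().
    }
}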
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
static TaskAttemptContext getNamedTaskContext(TaskAttemptContext context, String namedOutput) throws IOException {
    Job job = getNamedJob(context, namedOutput);
    return new TaskAttemptContextImpl(job.getConfiguration(), context.getTaskAttemptID(),
                                      new WrappedStatusReporter(context));
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
static JobContext getNamedJobContext(JobContext context, String namedOutput) throws IOException {
    Job job = getNamedJob(context, namedOutput);
    return new JobContextImpl(job.getConfiguration(), job.getJobID());
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputs.java
License:Apache License
private static Job getNamedJob(JobContext context, String namedOutput) throws IOException {
    // The following trick leverages the instantiation of a record writer via
    // the job thus supporting arbitrary output formats.
    Job job = Job.getInstance(context.getConfiguration());
    job.setOutputFormatClass(getNamedOutputFormatClass(context, namedOutput));
    job.setOutputKeyClass(getNamedOutputKeyClass(context, namedOutput));
    job.setOutputValueClass(getNamedOutputValueClass(context, namedOutput));

    Configuration conf = job.getConfiguration();
    Map<String, String> namedConfigurations = ConfigurationUtil
        .getNamedConfigurations(context.getConfiguration(), computePrefixName(namedOutput));
    ConfigurationUtil.setAll(namedConfigurations, conf);
    return job;
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper.java
License:Apache License
/**
 * Sets an OutputFormat class as the root OutputFormat for the Hadoop job.
 *
 * @param job the job on which to set the OutputFormat class
 * @param outputFormatClass the class to set as the root OutputFormat for the job
 * @param outputConfig the configuration to set for the specified OutputFormat
 */
public static void setRootOutputFormat(Job job, String outputFormatClass, Map<String, String> outputConfig) {
    job.getConfiguration().set(ROOT_OUTPUT_FORMAT, outputFormatClass);
    for (Map.Entry<String, String> confEntry : outputConfig.entrySet()) {
        job.getConfiguration().set(confEntry.getKey(), confEntry.getValue());
    }
}
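A sketch of calling setRootOutputFormat, assuming the stock Hadoop TextOutputFormat as the root format; the separator value is illustrative.

import co.cask.cdap.internal.app.runtime.batch.dataset.output.MultipleOutputsMainOutputWrapper;
import com.google.common.collect.ImmutableMap;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class RootOutputFormatUsage {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        MultipleOutputsMainOutputWrapper.setRootOutputFormat(job,
                TextOutputFormat.class.getName(),
                ImmutableMap.of("mapreduce.output.textoutputformat.separator", "\t"));
        // Both the root OutputFormat name and each config entry are written
        // directly into job.getConfiguration().
    }
}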
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitionerWriterWrapper.java
License:Apache License
private TaskAttemptContext getTaskAttemptContext(TaskAttemptContext context, String newOutputName) throws IOException {
    Job job = new Job(context.getConfiguration());
    DynamicPartitioningOutputFormat.setOutputName(job, newOutputName);
    // CDAP-4806 We must set this parameter in addition to calling FileOutputFormat#setOutputName, because
    // AvroKeyOutputFormat/AvroKeyValueOutputFormat use a different parameter for the output name than FileOutputFormat.
    if (isAvroOutputFormat(getFileOutputFormat(context))) {
        job.getConfiguration().set("avro.mo.config.namedOutput", newOutputName);
    }

    Path jobOutputPath = DynamicPartitioningOutputFormat
        .createJobSpecificPath(FileOutputFormat.getOutputPath(job), context);
    DynamicPartitioningOutputFormat.setOutputPath(job, jobOutputPath);
    return new TaskAttemptContextImpl(job.getConfiguration(), context.getTaskAttemptID());
}