List of usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, String> formatMap = PangoolMultipleInputs.getInputFormatMap(job); Map<Path, String> mapperMap = PangoolMultipleInputs.getInputProcessorFileMap(job); for (Map.Entry<Path, String> entry : formatMap.entrySet()) { FileInputFormat.setInputPaths(jobCopy, entry.getKey()); InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, entry.getValue(), true);//from w ww . java 2 s . c om PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), entry.getValue()); List<InputSplit> pathSplits = inputFormat.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, entry.getValue(), mapperMap.get(entry.getKey()))); } } return splits; }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.PangoolMultipleInputs.java
License:Apache License
/** * Add a {@link Path} with a custom {@link InputFormat} and {@link Mapper} to the list of inputs for the map-reduce * job. Returns the instance files created. * /* w ww . ja va 2s.c om*/ * @param job * The {@link Job} * @param path * {@link Path} to be added to the list of inputs for the job * @param inputFormat * {@link InputFormat} class to use for this path * @param mapperInstance * {@link Mapper} instance to use * @throws IOException * @throws FileNotFoundException */ public static Set<String> addInputPath(Job job, Path path, InputFormat inputFormat, Mapper mapperInstance, Map<String, String> specificContext) throws FileNotFoundException, IOException { Set<String> instanceFiles = new HashSet<String>(); // Serialize the Mapper instance String uniqueNameMapper = UUID.randomUUID().toString() + '.' + "mapper.dat"; try { InstancesDistributor.distribute(mapperInstance, uniqueNameMapper, job.getConfiguration()); instanceFiles.add(uniqueNameMapper); } catch (URISyntaxException e) { throw new IOException(e); } // Serialize the Input Format String uniqueNameInputFormat = UUID.randomUUID().toString() + '.' + "inputFormat.dat"; try { InstancesDistributor.distribute(inputFormat, uniqueNameInputFormat, job.getConfiguration()); instanceFiles.add(uniqueNameInputFormat); } catch (URISyntaxException e) { throw new IOException(e); } for (Map.Entry<String, String> contextKeyValue : specificContext.entrySet()) { PangoolMultipleInputs.addInputContext(job, uniqueNameInputFormat, contextKeyValue.getKey(), contextKeyValue.getValue()); } addInputPath(job, path, uniqueNameInputFormat); Configuration conf = job.getConfiguration(); String mapperMapping = path.toString() + ";" + uniqueNameMapper; String mappers = conf.get(PANGOOL_INPUT_DIR_MAPPERS_CONF); conf.set(PANGOOL_INPUT_DIR_MAPPERS_CONF, mappers == null ? mapperMapping : mappers + "," + mapperMapping); job.setMapperClass(DelegatingMapper.class); return instanceFiles; }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.PangoolMultipleInputs.java
License:Apache License
private static void addInputPath(Job job, Path path, String inputFormatInstance) { /*/*from w ww .ja v a 2s .co m*/ * Only internal -> not allowed to add inputs without associated InputProcessor files */ String inputFormatMapping = path.toString() + ";" + inputFormatInstance; Configuration conf = job.getConfiguration(); String inputFormats = conf.get(PANGOOL_INPUT_DIR_FORMATS_CONF); conf.set(PANGOOL_INPUT_DIR_FORMATS_CONF, inputFormats == null ? inputFormatMapping : inputFormats + "," + inputFormatMapping); job.setInputFormatClass(DelegatingInputFormat.class); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.PangoolMultipleInputs.java
License:Apache License
/** * Specific (key, value) configurations for each Input. Some Input Formats read specific configuration values and act * based on them./*from w w w. ja v a2 s . c o m*/ */ public static void addInputContext(Job job, String inputName, String key, String value) { // Check that this named output has been configured before Configuration conf = job.getConfiguration(); // Add specific configuration conf.set(MI_PREFIX + inputName + CONF + "." + key, value); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
/**
 * Adds a named output for the job. Returns the instance file that has been created.
 * <p>
 * The output format instance is distributed under a freshly generated unique name. The named output is appended to
 * the space-separated {@code MULTIPLE_OUTPUTS} property, and its format instance file, key class and value class are
 * recorded under the named output's own property prefix.
 *
 * @param job
 *          the job to configure
 * @param namedOutput
 *          name of the output being added
 * @param outputFormat
 *          the {@link OutputFormat} instance to distribute for this output
 * @param keyClass
 *          key class of the named output
 * @param valueClass
 *          value class of the named output
 * @return the unique name of the instance file created for the output format
 * @throws FileNotFoundException
 * @throws IOException
 * @throws URISyntaxException
 */
public static String addNamedOutput(Job job, String namedOutput, OutputFormat outputFormat, Class<?> keyClass,
    Class<?> valueClass) throws FileNotFoundException, IOException, URISyntaxException {
  checkNamedOutputName(job, namedOutput, true);
  Configuration conf = job.getConfiguration();

  String instanceFile = UUID.randomUUID().toString() + '.' + "out-format.dat";
  InstancesDistributor.distribute(outputFormat, instanceFile, conf);

  String alreadyRegistered = conf.get(MULTIPLE_OUTPUTS, "");
  conf.set(MULTIPLE_OUTPUTS, alreadyRegistered + " " + namedOutput);

  String outputPrefix = MO_PREFIX + namedOutput;
  conf.set(outputPrefix + FORMAT_INSTANCE_FILE, instanceFile);
  conf.setClass(outputPrefix + KEY, keyClass, Object.class);
  conf.setClass(outputPrefix + VALUE, valueClass, Object.class);
  return instanceFile;
}
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
/** * Added this method for allowing specific (key, value) configurations for each Output. Some Output Formats read * specific configuration values and act based on them. * /* ww w . j av a 2 s .c o m*/ * @param namedOutput * @param key * @param value */ public static void addNamedOutputContext(Job job, String namedOutput, String key, String value) { // Check that this named output has been configured before Configuration conf = job.getConfiguration(); // Add specific configuration conf.set(MO_PREFIX + namedOutput + CONF + "." + key, value); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
/** * Iterates over the Configuration and sets the specific context found for the namedOutput in the Job instance. * Package-access so it can be unit tested. The specific context is configured in method this. * {@link #addNamedOutputContext(Job, String, String, String)}. * //from ww w. java 2 s . c o m * @param conf * The configuration that may contain specific context for the named output * @param job * The Job where we will set the specific context * @param namedOutput * The named output */ public static void setSpecificNamedOutputContext(Configuration conf, Job job, String namedOutput) { for (Map.Entry<String, String> entries : conf) { String confKey = entries.getKey(); String confValue = entries.getValue(); if (confKey.startsWith(MO_PREFIX + namedOutput + CONF)) { // Specific context key, value found String contextKey = confKey.substring((MO_PREFIX + namedOutput + CONF + ".").length(), confKey.length()); job.getConfiguration().set(contextKey, confValue); } } }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
/** * Enables or disables counters for the named outputs. * /*from w ww . ja v a 2 s . c om*/ * The counters group is the {@link PangoolMultipleOutputs} class name. The names of the counters are the same as the * named outputs. These counters count the number records written to each output name. By default these counters are * disabled. * * @param job * job to enable counters * @param enabled * indicates if the counters will be enabled or not. */ public static void setCountersEnabled(Job job, boolean enabled) { job.getConfiguration().setBoolean(COUNTERS_ENABLED, enabled); }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.PangoolMultipleOutputs.java
License:Apache License
public synchronized RecordWriter getRecordWriter(String baseFileName) throws IOException, InterruptedException { // Look for record-writer in the cache OutputContext context = outputContexts.get(baseFileName); // If not in cache, create a new one if (context == null) { context = new OutputContext(); OutputFormat mainOutputFormat;//from ww w . j av a2 s.com try { mainOutputFormat = ((OutputFormat) ReflectionUtils.newInstance(this.context.getOutputFormatClass(), this.context.getConfiguration())); } catch (ClassNotFoundException e1) { throw new RuntimeException(e1); } ProxyOutputCommitter baseOutputCommitter = ((ProxyOutputCommitter) mainOutputFormat .getOutputCommitter(this.context)); // The trick is to create a new Job for each output Job job = new Job(this.context.getConfiguration()); job.setOutputKeyClass(getNamedOutputKeyClass(this.context, baseFileName)); job.setOutputValueClass(getNamedOutputValueClass(this.context, baseFileName)); // Check possible specific context for the output setSpecificNamedOutputContext(this.context.getConfiguration(), job, baseFileName); TaskAttemptContext taskContext; try { taskContext = TaskAttemptContextFactory.get(job.getConfiguration(), this.context.getTaskAttemptID()); } catch (Exception e) { throw new IOException(e); } // First we change the output dir for the new OutputFormat that we will // create // We put it inside the main output work path -> in case the Job fails, // everything will be discarded taskContext.getConfiguration().set("mapred.output.dir", baseOutputCommitter.getBaseDir() + "/" + baseFileName); // This is for Hadoop 2.0 : taskContext.getConfiguration().set("mapreduce.output.fileoutputformat.outputdir", baseOutputCommitter.getBaseDir() + "/" + baseFileName); context.taskAttemptContext = taskContext; // Load the OutputFormat instance OutputFormat outputFormat = InstancesDistributor.loadInstance( context.taskAttemptContext.getConfiguration(), OutputFormat.class, getNamedOutputFormatInstanceFile(this.context, 
baseFileName), true); // We have to create a JobContext for meeting the contract of the // OutputFormat JobContext jobContext; try { jobContext = JobContextFactory.get(taskContext.getConfiguration(), taskContext.getJobID()); } catch (Exception e) { throw new IOException(e); } context.jobContext = jobContext; // The contract of the OutputFormat is to check the output specs outputFormat.checkOutputSpecs(jobContext); // We get the output committer so we can call it later context.outputCommitter = outputFormat.getOutputCommitter(taskContext); // Save the RecordWriter to cache it context.recordWriter = outputFormat.getRecordWriter(taskContext); // if counters are enabled, wrap the writer with context // to increment counters if (countersEnabled) { context.recordWriter = new RecordWriterWithCounter(context.recordWriter, baseFileName, this.context); } outputContexts.put(baseFileName, context); } return context.recordWriter; }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.output.TestPangoolMultipleOutputs.java
License:Apache License
@Test public void testSpecificContext() throws IOException { // Test that we can add specific key, value configurations for each output Configuration conf = new Configuration(); Job job = new Job(conf); PangoolMultipleOutputs.addNamedOutputContext(job, "foo", "my.context.property", "myValue"); PangoolMultipleOutputs.setSpecificNamedOutputContext(job.getConfiguration(), job, "foo"); Assert.assertEquals("myValue", job.getConfiguration().get("my.context.property")); }