List of usage examples for the getSplits method of org.apache.hadoop.mapreduce.InputFormat
/**
 * Returns the input splits for the job described by {@code context}.
 *
 * @param context the job context the splits are computed for
 * @return the list of input splits
 * @throws IOException declared by the signature; the conditions depend on the implementation
 * @throws InterruptedException declared by the signature; the conditions depend on the implementation
 */
public abstract List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException;
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new TaggedInputSplit(inputName, split, confCopy, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }/*from w w w.j a v a 2 s . c o m*/ } return splits; }
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = new ArrayList<>(); Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration()); for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) { String inputName = mapperInputEntry.getKey(); MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue(); String mapperClassName = mapperInput.getMapperClassName(); Job jobCopy = new Job(job.getConfiguration()); Configuration confCopy = jobCopy.getConfiguration(); // set configuration specific for this input onto the jobCopy ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy); Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName()); Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ", mapperInput.getInputFormatClassName()); InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy); //some input format need a jobId to getSplits jobCopy.setJobID(new JobID(inputName, inputName.hashCode())); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a MultiInputTaggedSplit. List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy); for (InputSplit split : formatSplits) { splits.add(new MultiInputTaggedSplit(split, confCopy, inputName, mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName)); }//from ww w . ja v a 2 s . co m } return splits; }
From source file:com.ambiata.ivory.operation.hadoop.DelegatingInputFormat.java
License:Apache License
@SuppressWarnings("unchecked") public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); List<InputSplit> splits = new ArrayList<InputSplit>(); Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job); Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job); Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>(); // First, build a map of InputFormats to Paths for (Entry<Path, InputFormat> entry : formatMap.entrySet()) { if (!formatPaths.containsKey(entry.getValue().getClass())) { formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>()); }/* www . j a v a 2 s. com*/ formatPaths.get(entry.getValue().getClass()).add(entry.getKey()); } for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) { Class<? extends InputFormat> formatClass = formatEntry.getKey(); InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf); List<Path> paths = formatEntry.getValue(); Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>(); // Now, for each set of paths that have a common InputFormat, build // a map of Mappers to the paths they're used for for (Path path : paths) { Class<? extends Mapper> mapperClass = mapperMap.get(path); if (!mapperPaths.containsKey(mapperClass)) { mapperPaths.put(mapperClass, new LinkedList<Path>()); } mapperPaths.get(mapperClass).add(path); } // Now each set of paths that has a common InputFormat and Mapper can // be added to the same job, and split together. for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) { paths = mapEntry.getValue(); Class<? 
extends Mapper> mapperClass = mapEntry.getKey(); if (mapperClass == null) { try { mapperClass = job.getMapperClass(); } catch (ClassNotFoundException e) { throw new IOException("Mapper class is not found", e); } } FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> pathSplits = format.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass)); } } } return splits; }
From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java
License:Apache License
@SuppressWarnings({ "rawtypes", "unchecked" }) private void runMap(Job job, KeyValueSorter<?, ?> sorter) throws IOException, InterruptedException, ClassNotFoundException { Configuration conf = job.getConfiguration(); InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf); List<InputSplit> splits = input.getSplits(job); int serial = 1; for (InputSplit split : splits) { TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0); Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf); if (LOG.isDebugEnabled()) { LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$ mapper.getClass().getName(), id, split.getLength())); }// w w w .ja v a 2 s.co m TaskAttemptContext context = newTaskAttemptContext(conf, id); // we always obtain a new OutputFormat object / OutputFormat.getOutputCommiter() may be cached OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf); OutputCommitter committer = output.getOutputCommitter(context); committer.setupTask(context); boolean succeed = false; try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) { RecordWriter<?, ?> writer; if (sorter != null) { writer = new ShuffleWriter(sorter); } else { writer = output.getRecordWriter(newTaskAttemptContext(conf, id)); } try { Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split); reader.initialize(split, c); mapper.run(c); } finally { writer.close(newTaskAttemptContext(conf, id)); } doCommitTask(context, committer); succeed = true; } finally { if (succeed == false) { doAbortTask(context, committer); } } } }
From source file:com.asakusafw.runtime.stage.input.StageInputFormat.java
License:Apache License
/**
 * Computes the stage input splits for the inputs configured on {@code context}.
 *
 * <p>A previously cached result for the same context and inputs is returned as-is.
 * Otherwise the inputs are grouped by input format and mapper, each group's splits are
 * obtained from its input format, and every split is wrapped in a {@code StageInputSplit}
 * bound to the group's mapper class. The result is stored in the cache before returning.
 *
 * @param context the current job context (must not be {@code null})
 * @return the computed (or cached) splits
 * @throws IOException if an input format fails to compute its splits
 * @throws InterruptedException if split computation is interrupted
 */
static List<StageInputSplit> computeSplits(JobContext context) throws IOException, InterruptedException {
    assert context != null;
    List<StageInput> inputs = StageInputDriver.getInputs(context.getConfiguration());
    List<StageInputSplit> cached = Cache.find(context, inputs);
    if (cached != null) {
        return cached;
    }
    Map<FormatAndMapper, List<StageInput>> paths = groupByFormatAndMapper(inputs);
    Map<Class<? extends InputFormat<?, ?>>, InputFormat<?, ?>> formats =
            instantiateFormats(context, paths.keySet());
    Job temporaryJob = JobCompatibility.newJob(context.getConfiguration());
    List<StageInputSplit> results = new ArrayList<>();
    for (Map.Entry<FormatAndMapper, List<StageInput>> entry : paths.entrySet()) {
        FormatAndMapper formatAndMapper = entry.getKey();
        List<StageInput> current = entry.getValue();
        InputFormat<?, ?> format = formats.get(formatAndMapper.formatClass);
        // Check before first use: a null format matches none of the instanceof tests and
        // would otherwise fail in the fallback branch with a bare NullPointerException.
        assert format != null : formatAndMapper.formatClass.getName();
        List<? extends InputSplit> splits;
        if (format instanceof FileInputFormat<?, ?>) {
            // File-based formats read their paths from the (shared) temporary job.
            FileInputFormat.setInputPaths(temporaryJob, toPathArray(current));
            splits = format.getSplits(temporaryJob);
        } else if (format instanceof BridgeInputFormat) {
            splits = ((BridgeInputFormat) format).getSplits(context, current);
        } else if (format instanceof TemporaryInputFormat<?>) {
            splits = ((TemporaryInputFormat<?>) format).getSplits(context, current);
        } else {
            splits = format.getSplits(temporaryJob);
        }
        Class<? extends Mapper<?, ?, ?, ?>> mapper = formatAndMapper.mapperClass;
        for (InputSplit split : splits) {
            Source source = new Source(split, formatAndMapper.formatClass);
            StageInputSplit wrapped = new StageInputSplit(mapper, Collections.singletonList(source));
            wrapped.setConf(context.getConfiguration());
            results.add(wrapped);
        }
    }
    Cache.put(context, inputs, results);
    return results;
}
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = Lists.newArrayList(); Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs .getFormatNodeMap(jobCopy);//w w w . j a va 2 s .co m // First, build a map of InputFormats to Paths for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) { Class<? extends InputFormat> formatClass = entry.getKey(); InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf); for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) { Integer nodeIndex = nodeEntry.getKey(); List<Path> paths = nodeEntry.getValue(); FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> pathSplits = format.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf)); } } } return splits; }
From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.DelegatingInputFormat.java
License:Apache License
/**
 * Computes the splits of each input path registered through {@code PangoolMultipleInputs}
 * and wraps them in {@code TaggedInputSplit}s that carry the path's input-format entry and
 * its input-processor entry.
 *
 * @param job the job context describing the inputs
 * @return the tagged splits of all input paths
 * @throws IOException if an input format fails to compute its splits
 * @throws InterruptedException if split computation is interrupted
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Map<Path, String> formatByPath = PangoolMultipleInputs.getInputFormatMap(job);
    Map<Path, String> processorByPath = PangoolMultipleInputs.getInputProcessorFileMap(job);

    for (Map.Entry<Path, String> pathAndFormat : formatByPath.entrySet()) {
        Path inputPath = pathAndFormat.getKey();
        String formatEntry = pathAndFormat.getValue();

        // Point the job copy at this single path before asking for splits.
        FileInputFormat.setInputPaths(jobCopy, inputPath);
        InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, formatEntry, true);
        PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), formatEntry);

        List<InputSplit> pathSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit pathSplit : pathSplits) {
            result.add(new TaggedInputSplit(pathSplit, conf, formatEntry, processorByPath.get(inputPath)));
        }
    }
    return result;
}
From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java
License:Apache License
protected double getTotalMapInputMB() throws ClassNotFoundException, IOException, InterruptedException, JobException { if (job == null) { throw new JobException("Job is null"); }//from w w w.j a va2 s.com long mapInputBytes = 0; InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration()); for (InputSplit split : input.getSplits(job)) { mapInputBytes += split.getLength(); } if (mapInputBytes == 0) { throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!"); } double totalMapInputMB = (double) mapInputBytes / 1024 / 1024; return totalMapInputMB; }
From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java
License:Apache License
protected int getMapInputSplitCount() throws ClassNotFoundException, JobException, IOException, InterruptedException { if (job == null) { throw new JobException("Job is null"); }//ww w. j ava 2 s .co m InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration()); return input.getSplits(job).size(); }
From source file:com.linkedin.cubert.io.CubertInputFormat.java
License:Open Source License
@Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { Configuration conf = context.getConfiguration(); ConfigurationDiff confDiff = new ConfigurationDiff(conf); int numMultiMappers = confDiff.getNumDiffs(); List<InputSplit> splits = new ArrayList<InputSplit>(); for (int mapperIndex = 0; mapperIndex < numMultiMappers; mapperIndex++) { // reset conf to multimapper i confDiff.applyDiff(mapperIndex); // get the actual input format InputFormat<K, V> actualInputFormat = getActualInputFormat(context); List<InputSplit> actualSplits = null; // check if combined input split is requested boolean combineSplit = conf.getBoolean(CubertStrings.COMBINED_INPUT, false); if (combineSplit) { // Create CombinedFileInputFormat CombineFileInputFormat<K, V> cfif = new CombineFileInputFormat<K, V>() { @Override/*w ww . j a v a 2 s . c om*/ public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException { throw new IllegalStateException("Should not be called"); } }; // get the splits actualSplits = cfif.getSplits(context); } else { actualSplits = actualInputFormat.getSplits(context); } // embed each split in MultiMapperSplit and add to list for (InputSplit actualSplit : actualSplits) splits.add(new MultiMapperSplit(actualSplit, mapperIndex)); // undo the diff confDiff.undoDiff(mapperIndex); } return splits; }