Example usage for org.apache.hadoop.mapreduce InputFormat getSplits

List of usage examples for org.apache.hadoop.mapreduce InputFormat getSplits

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce InputFormat getSplits.

Prototype

public abstract List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException;

Source Link

Document

Logically split the set of input files for the job.

Usage

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java

License:Apache License

@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());

    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = new Job(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();

        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);

        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ",
                mapperInput.getInputFormatClassName());
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);

        // Get splits for each input path and tag with InputFormat
        // and Mapper types by wrapping in a TaggedInputSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new TaggedInputSplit(inputName, split, confCopy,
                    mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }/*from  w  w  w.j  a v  a 2  s  . c o m*/
    }
    return splits;
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java

License:Apache License

@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());

    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = new Job(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();

        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);

        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: ",
                mapperInput.getInputFormatClassName());

        InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);
        //some input format need a jobId to getSplits
        jobCopy.setJobID(new JobID(inputName, inputName.hashCode()));

        // Get splits for each input path and tag with InputFormat
        // and Mapper types by wrapping in a MultiInputTaggedSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new MultiInputTaggedSplit(split, confCopy, inputName,
                    mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }//from  ww w  . ja  v a  2  s  .  co  m
    }
    return splits;
}

From source file:com.ambiata.ivory.operation.hadoop.DelegatingInputFormat.java

License:Apache License

@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);
    Map<Class<? extends InputFormat>, List<Path>> formatPaths = new HashMap<Class<? extends InputFormat>, List<Path>>();

    // First, build a map of InputFormats to Paths
    for (Entry<Path, InputFormat> entry : formatMap.entrySet()) {
        if (!formatPaths.containsKey(entry.getValue().getClass())) {
            formatPaths.put(entry.getValue().getClass(), new LinkedList<Path>());
        }/* www . j a  v a  2 s.  com*/

        formatPaths.get(entry.getValue().getClass()).add(entry.getKey());
    }

    for (Entry<Class<? extends InputFormat>, List<Path>> formatEntry : formatPaths.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        List<Path> paths = formatEntry.getValue();

        Map<Class<? extends Mapper>, List<Path>> mapperPaths = new HashMap<Class<? extends Mapper>, List<Path>>();

        // Now, for each set of paths that have a common InputFormat, build
        // a map of Mappers to the paths they're used for
        for (Path path : paths) {
            Class<? extends Mapper> mapperClass = mapperMap.get(path);
            if (!mapperPaths.containsKey(mapperClass)) {
                mapperPaths.put(mapperClass, new LinkedList<Path>());
            }

            mapperPaths.get(mapperClass).add(path);
        }

        // Now each set of paths that has a common InputFormat and Mapper can
        // be added to the same job, and split together.
        for (Entry<Class<? extends Mapper>, List<Path>> mapEntry : mapperPaths.entrySet()) {
            paths = mapEntry.getValue();
            Class<? extends Mapper> mapperClass = mapEntry.getKey();

            if (mapperClass == null) {
                try {
                    mapperClass = job.getMapperClass();
                } catch (ClassNotFoundException e) {
                    throw new IOException("Mapper class is not found", e);
                }
            }

            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }

    return splits;
}

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
private void runMap(Job job, KeyValueSorter<?, ?> sorter)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    List<InputSplit> splits = input.getSplits(job);
    int serial = 1;
    for (InputSplit split : splits) {
        TaskAttemptID id = newTaskAttemptId(newMapTaskId(job.getJobID(), serial++), 0);
        Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$
                    mapper.getClass().getName(), id, split.getLength()));
        }// w  w w  .ja  v  a  2  s.co  m
        TaskAttemptContext context = newTaskAttemptContext(conf, id);
        // we always obtain a new OutputFormat object / OutputFormat.getOutputCommiter() may be cached
        OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
        OutputCommitter committer = output.getOutputCommitter(context);
        committer.setupTask(context);
        boolean succeed = false;
        try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, id))) {
            RecordWriter<?, ?> writer;
            if (sorter != null) {
                writer = new ShuffleWriter(sorter);
            } else {
                writer = output.getRecordWriter(newTaskAttemptContext(conf, id));
            }
            try {
                Mapper.Context c = newMapperContext(conf, id, reader, writer, committer, split);
                reader.initialize(split, c);
                mapper.run(c);
            } finally {
                writer.close(newTaskAttemptContext(conf, id));
            }
            doCommitTask(context, committer);
            succeed = true;
        } finally {
            if (succeed == false) {
                doAbortTask(context, committer);
            }
        }
    }
}

From source file:com.asakusafw.runtime.stage.input.StageInputFormat.java

License:Apache License

static List<StageInputSplit> computeSplits(JobContext context) throws IOException, InterruptedException {
    assert context != null;
    List<StageInput> inputs = StageInputDriver.getInputs(context.getConfiguration());
    List<StageInputSplit> cached = Cache.find(context, inputs);
    if (cached != null) {
        return cached;
    }/*from www.  j  a v  a  2s  . com*/
    Map<FormatAndMapper, List<StageInput>> paths = groupByFormatAndMapper(inputs);
    Map<Class<? extends InputFormat<?, ?>>, InputFormat<?, ?>> formats = instantiateFormats(context,
            paths.keySet());
    Job temporaryJob = JobCompatibility.newJob(context.getConfiguration());
    List<StageInputSplit> results = new ArrayList<>();
    for (Map.Entry<FormatAndMapper, List<StageInput>> entry : paths.entrySet()) {
        FormatAndMapper formatAndMapper = entry.getKey();
        List<StageInput> current = entry.getValue();
        InputFormat<?, ?> format = formats.get(formatAndMapper.formatClass);
        List<? extends InputSplit> splits;
        if (format instanceof FileInputFormat<?, ?>) {
            FileInputFormat.setInputPaths(temporaryJob, toPathArray(current));
            splits = format.getSplits(temporaryJob);
        } else if (format instanceof BridgeInputFormat) {
            splits = ((BridgeInputFormat) format).getSplits(context, current);
        } else if (format instanceof TemporaryInputFormat<?>) {
            splits = ((TemporaryInputFormat<?>) format).getSplits(context, current);
        } else {
            splits = format.getSplits(temporaryJob);
        }
        assert format != null : formatAndMapper.formatClass.getName();
        Class<? extends Mapper<?, ?, ?, ?>> mapper = formatAndMapper.mapperClass;
        for (InputSplit split : splits) {
            Source source = new Source(split, formatAndMapper.formatClass);
            StageInputSplit wrapped = new StageInputSplit(mapper, Collections.singletonList(source));
            wrapped.setConf(context.getConfiguration());
            results.add(wrapped);
        }
    }
    Cache.put(context, inputs, results);
    return results;
}

From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = Lists.newArrayList();
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs
            .getFormatNodeMap(jobCopy);//w w  w  . j  a  va 2  s .co  m

    // First, build a map of InputFormats to Paths
    for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
        Class<? extends InputFormat> formatClass = entry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> paths = nodeEntry.getValue();
            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf));
            }
        }
    }
    return splits;
}

From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.DelegatingInputFormat.java

License:Apache License

@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();

    Map<Path, String> formatMap = PangoolMultipleInputs.getInputFormatMap(job);
    Map<Path, String> mapperMap = PangoolMultipleInputs.getInputProcessorFileMap(job);

    for (Map.Entry<Path, String> entry : formatMap.entrySet()) {
        FileInputFormat.setInputPaths(jobCopy, entry.getKey());
        InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, entry.getValue(),
                true);/*from   w ww.  j a  v  a2  s . c o m*/
        PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), entry.getValue());
        List<InputSplit> pathSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit pathSplit : pathSplits) {
            splits.add(new TaggedInputSplit(pathSplit, conf, entry.getValue(), mapperMap.get(entry.getKey())));
        }
    }

    return splits;
}

From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java

License:Apache License

protected double getTotalMapInputMB()
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }//from w  w  w.j  a va2 s.com

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}

From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java

License:Apache License

protected int getMapInputSplitCount()
        throws ClassNotFoundException, JobException, IOException, InterruptedException {
    if (job == null) {
        throw new JobException("Job is null");
    }//ww  w. j ava  2  s .co  m
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    return input.getSplits(job).size();
}

From source file:com.linkedin.cubert.io.CubertInputFormat.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    ConfigurationDiff confDiff = new ConfigurationDiff(conf);

    int numMultiMappers = confDiff.getNumDiffs();

    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (int mapperIndex = 0; mapperIndex < numMultiMappers; mapperIndex++) {
        // reset conf to multimapper i
        confDiff.applyDiff(mapperIndex);

        // get the actual input format
        InputFormat<K, V> actualInputFormat = getActualInputFormat(context);

        List<InputSplit> actualSplits = null;

        // check if combined input split is requested
        boolean combineSplit = conf.getBoolean(CubertStrings.COMBINED_INPUT, false);

        if (combineSplit) {
            // Create CombinedFileInputFormat
            CombineFileInputFormat<K, V> cfif = new CombineFileInputFormat<K, V>() {
                @Override/*w  ww  . j a  v  a 2  s  . c  om*/
                public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
                        throws IOException {
                    throw new IllegalStateException("Should not be called");
                }
            };

            // get the splits
            actualSplits = cfif.getSplits(context);
        } else {
            actualSplits = actualInputFormat.getSplits(context);
        }

        // embed each split in MultiMapperSplit and add to list
        for (InputSplit actualSplit : actualSplits)
            splits.add(new MultiMapperSplit(actualSplit, mapperIndex));

        // undo the diff
        confDiff.undoDiff(mapperIndex);
    }
    return splits;
}