Example usage for org.apache.hadoop.mapreduce.InputFormat.getSplits

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.InputFormat.getSplits.

Prototype

public abstract List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException;

Source Link

Document

Logically split the set of input files for the job.

Usage

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.DelegatingInputFormat.java

License:Apache License

/**
 * Computes the splits of every configured input and tags each split with the
 * input it came from.
 *
 * <p>For each entry returned by {@code MultipleInputs.getInputMap}, a new
 * {@code Job} is created from the job configuration, the input-specific
 * settings are applied onto it, the named InputFormat is instantiated and asked
 * for its splits, and every split is wrapped in a {@code TaggedInputSplit}.
 *
 * @param job the job context whose configuration describes the inputs
 * @return the tagged splits of all inputs
 * @throws IOException if a delegate InputFormat fails to compute its splits
 * @throws InterruptedException if a delegate InputFormat is interrupted
 * @throws NullPointerException if the InputFormat class of an input cannot be loaded
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());

    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = new Job(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();

        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);

        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        // The %s placeholder puts the class name inline in the message; without it the
        // name would only be appended in square brackets as an unmatched argument.
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: %s",
                mapperInput.getInputFormatClassName());
        InputFormat inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);

        // Get splits for this input and tag each with the input name, InputFormat
        // and Mapper class name by wrapping it in a TaggedInputSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new TaggedInputSplit(inputName, split, confCopy,
                    mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }
    }
    return splits;
}

From source file:co.cask.cdap.internal.app.runtime.batch.dataset.input.MultiInputFormat.java

License:Apache License

/**
 * Computes the splits of every configured input and tags each split with the
 * input it came from.
 *
 * <p>For each entry returned by {@code MultipleInputs.getInputMap}, a new
 * {@code Job} is created from the job configuration, the input-specific
 * settings are applied onto it, a job id derived from the input name is set,
 * the named InputFormat is instantiated and asked for its splits, and every
 * split is wrapped in a {@code MultiInputTaggedSplit}.
 *
 * @param job the job context whose configuration describes the inputs
 * @return the tagged splits of all inputs
 * @throws IOException if a delegate InputFormat fails to compute its splits
 * @throws InterruptedException if a delegate InputFormat is interrupted
 * @throws NullPointerException if the InputFormat class of an input cannot be loaded
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = new ArrayList<>();
    Map<String, MultipleInputs.MapperInput> mapperInputMap = MultipleInputs.getInputMap(job.getConfiguration());

    for (Map.Entry<String, MultipleInputs.MapperInput> mapperInputEntry : mapperInputMap.entrySet()) {
        String inputName = mapperInputEntry.getKey();
        MultipleInputs.MapperInput mapperInput = mapperInputEntry.getValue();
        String mapperClassName = mapperInput.getMapperClassName();
        Job jobCopy = new Job(job.getConfiguration());
        Configuration confCopy = jobCopy.getConfiguration();

        // set configuration specific for this input onto the jobCopy
        ConfigurationUtil.setAll(mapperInput.getInputFormatConfiguration(), confCopy);

        Class<?> inputFormatClass = confCopy.getClassByNameOrNull(mapperInput.getInputFormatClassName());
        // The %s placeholder puts the class name inline in the message; without it the
        // name would only be appended in square brackets as an unmatched argument.
        Preconditions.checkNotNull(inputFormatClass, "Class could not be found: %s",
                mapperInput.getInputFormatClassName());

        InputFormat<K, V> inputFormat = (InputFormat) ReflectionUtils.newInstance(inputFormatClass, confCopy);
        // Some input formats need a job id to compute their splits.
        // NOTE(review): hashCode() can be negative - confirm JobID tolerates a negative id.
        jobCopy.setJobID(new JobID(inputName, inputName.hashCode()));

        // Get splits for this input and tag each with the input name, InputFormat
        // and Mapper class name by wrapping it in a MultiInputTaggedSplit.
        List<InputSplit> formatSplits = inputFormat.getSplits(jobCopy);
        for (InputSplit split : formatSplits) {
            splits.add(new MultiInputTaggedSplit(split, confCopy, inputName,
                    mapperInput.getInputFormatConfiguration(), inputFormat.getClass(), mapperClassName));
        }
    }
    return splits;
}

From source file:com.ambiata.ivory.operation.hadoop.DelegatingInputFormat.java

License:Apache License

/**
 * Computes the splits for all configured input paths.
 *
 * <p>Paths are grouped first by InputFormat class and then by Mapper class.
 * Each group is set as the input of a shared job copy and split in one call,
 * and every resulting split is wrapped in a {@code TaggedInputSplit} carrying
 * the InputFormat and Mapper classes. Paths without a registered Mapper use
 * the mapper class of the job itself.
 *
 * @param job the job context describing the inputs
 * @return the tagged splits of all input paths
 * @throws IOException if splitting fails or the job's mapper class is not found
 * @throws InterruptedException if a delegate InputFormat is interrupted
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<Path, InputFormat> formatMap = MultipleInputs.getInputFormatMap(job);
    Map<Path, Class<? extends Mapper>> mapperMap = MultipleInputs.getMapperTypeMap(job);

    // Step 1: collect the paths handled by each InputFormat class.
    Map<Class<? extends InputFormat>, List<Path>> pathsByFormat =
            new HashMap<Class<? extends InputFormat>, List<Path>>();
    for (Entry<Path, InputFormat> formatEntry : formatMap.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getValue().getClass();
        List<Path> grouped = pathsByFormat.get(formatClass);
        if (grouped == null) {
            grouped = new LinkedList<Path>();
            pathsByFormat.put(formatClass, grouped);
        }
        grouped.add(formatEntry.getKey());
    }

    for (Entry<Class<? extends InputFormat>, List<Path>> byFormat : pathsByFormat.entrySet()) {
        Class<? extends InputFormat> formatClass = byFormat.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);

        // Step 2: within one InputFormat, collect the paths handled by each Mapper
        // class. A path without a registered mapper is grouped under the null key.
        Map<Class<? extends Mapper>, List<Path>> pathsByMapper =
                new HashMap<Class<? extends Mapper>, List<Path>>();
        for (Path path : byFormat.getValue()) {
            Class<? extends Mapper> registeredMapper = mapperMap.get(path);
            List<Path> grouped = pathsByMapper.get(registeredMapper);
            if (grouped == null) {
                grouped = new LinkedList<Path>();
                pathsByMapper.put(registeredMapper, grouped);
            }
            grouped.add(path);
        }

        // Step 3: paths sharing both InputFormat and Mapper are split together.
        for (Entry<Class<? extends Mapper>, List<Path>> byMapper : pathsByMapper.entrySet()) {
            List<Path> groupPaths = byMapper.getValue();
            Class<? extends Mapper> mapperClass = byMapper.getKey();

            if (mapperClass == null) {
                // No mapper registered for these paths: use the job's own mapper.
                try {
                    mapperClass = job.getMapperClass();
                } catch (ClassNotFoundException e) {
                    throw new IOException("Mapper class is not found", e);
                }
            }

            Path[] inputPaths = groupPaths.toArray(new Path[groupPaths.size()]);
            FileInputFormat.setInputPaths(jobCopy, inputPaths);

            // Tag every split with its InputFormat and Mapper types.
            for (InputSplit pathSplit : format.getSplits(jobCopy)) {
                splits.add(new TaggedInputSplit(pathSplit, conf, format.getClass(), mapperClass));
            }
        }
    }

    return splits;
}

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunner.java

License:Apache License

/**
 * Runs the map phase of the job in-process, one split at a time.
 *
 * <p>For every split of the job's InputFormat a new mapper, task attempt
 * context, OutputFormat and committer are created. The mapper output goes to
 * a {@code ShuffleWriter} over {@code sorter} when a sorter is given, and to
 * the OutputFormat's record writer otherwise. The task is committed after the
 * mapper has run and its writer was closed; if anything fails before the
 * commit completes, the task is aborted.
 *
 * @param job the job to run the map phase of
 * @param sorter the sorter receiving mapper output, or {@code null} to write
 *        through the job's OutputFormat
 * @throws IOException if reading, writing, committing or aborting fails
 * @throws InterruptedException if the operation is interrupted
 * @throws ClassNotFoundException if a class configured on the job cannot be loaded
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void runMap(Job job, KeyValueSorter<?, ?> sorter)
        throws IOException, InterruptedException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int nextSerial = 1;
    for (InputSplit split : input.getSplits(job)) {
        TaskAttemptID attemptId = newTaskAttemptId(newMapTaskId(job.getJobID(), nextSerial), 0);
        nextSerial++;
        Mapper<?, ?, ?, ?> mapper = ReflectionUtils.newInstance(job.getMapperClass(), conf);
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("starting mapper: {0}@{1} ({2}bytes)", //$NON-NLS-1$
                    mapper.getClass().getName(), attemptId, split.getLength()));
        }
        TaskAttemptContext context = newTaskAttemptContext(conf, attemptId);
        // A new OutputFormat is created per task because
        // OutputFormat.getOutputCommitter() may return a cached committer.
        OutputFormat<?, ?> output = ReflectionUtils.newInstance(job.getOutputFormatClass(), conf);
        OutputCommitter committer = output.getOutputCommitter(context);
        committer.setupTask(context);
        boolean committed = false;
        try (RecordReader<?, ?> reader = input.createRecordReader(split, newTaskAttemptContext(conf, attemptId))) {
            RecordWriter<?, ?> writer;
            if (sorter == null) {
                writer = output.getRecordWriter(newTaskAttemptContext(conf, attemptId));
            } else {
                writer = new ShuffleWriter(sorter);
            }
            try {
                Mapper.Context mapperContext = newMapperContext(conf, attemptId, reader, writer, committer, split);
                reader.initialize(split, mapperContext);
                mapper.run(mapperContext);
            } finally {
                // The writer is closed whether or not the mapper succeeded.
                writer.close(newTaskAttemptContext(conf, attemptId));
            }
            doCommitTask(context, committer);
            committed = true;
        } finally {
            if (!committed) {
                doAbortTask(context, committer);
            }
        }
    }
}

From source file:com.asakusafw.runtime.stage.input.StageInputFormat.java

License:Apache License

/**
 * Computes the stage input splits for the given job context.
 *
 * <p>If {@code Cache.find} already holds splits for the context and inputs,
 * those are returned. Otherwise the inputs are grouped by format and mapper,
 * each group is split by its InputFormat, every split is wrapped in a
 * {@code StageInputSplit}, and the result is stored through {@code Cache.put}.
 *
 * @param context the current job context
 * @return the computed (or cached) splits
 * @throws IOException if an InputFormat fails to compute its splits
 * @throws InterruptedException if an InputFormat is interrupted
 */
static List<StageInputSplit> computeSplits(JobContext context) throws IOException, InterruptedException {
    assert context != null;
    List<StageInput> inputs = StageInputDriver.getInputs(context.getConfiguration());
    List<StageInputSplit> cached = Cache.find(context, inputs);
    if (cached != null) {
        return cached;
    }
    Map<FormatAndMapper, List<StageInput>> paths = groupByFormatAndMapper(inputs);
    Map<Class<? extends InputFormat<?, ?>>, InputFormat<?, ?>> formats = instantiateFormats(context,
            paths.keySet());
    Job temporaryJob = JobCompatibility.newJob(context.getConfiguration());
    List<StageInputSplit> results = new ArrayList<>();
    for (Map.Entry<FormatAndMapper, List<StageInput>> entry : paths.entrySet()) {
        FormatAndMapper formatAndMapper = entry.getKey();
        List<StageInput> current = entry.getValue();
        InputFormat<?, ?> format = formats.get(formatAndMapper.formatClass);
        // Checked before the first dereference so that a missing format is reported
        // with its class name instead of surfacing as a NullPointerException below.
        assert format != null : formatAndMapper.formatClass.getName();
        List<? extends InputSplit> splits;
        if (format instanceof FileInputFormat<?, ?>) {
            // File-based formats read their paths from the job, so set them on the temporary job.
            FileInputFormat.setInputPaths(temporaryJob, toPathArray(current));
            splits = format.getSplits(temporaryJob);
        } else if (format instanceof BridgeInputFormat) {
            splits = ((BridgeInputFormat) format).getSplits(context, current);
        } else if (format instanceof TemporaryInputFormat<?>) {
            splits = ((TemporaryInputFormat<?>) format).getSplits(context, current);
        } else {
            splits = format.getSplits(temporaryJob);
        }
        Class<? extends Mapper<?, ?, ?, ?>> mapper = formatAndMapper.mapperClass;
        for (InputSplit split : splits) {
            Source source = new Source(split, formatAndMapper.formatClass);
            StageInputSplit wrapped = new StageInputSplit(mapper, Collections.singletonList(source));
            wrapped.setConf(context.getConfiguration());
            results.add(wrapped);
        }
    }
    Cache.put(context, inputs, results);
    return results;
}

From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java

License:Apache License

/**
 * Computes the splits for every InputFormat and node index registered for the job.
 *
 * <p>The format/node/path mapping comes from {@code CrunchInputs.getFormatNodeMap}.
 * For each node of each InputFormat, the node's paths are set on a job copy,
 * the InputFormat computes the splits, and each split is wrapped in a
 * {@code CrunchInputSplit} together with the format class and node index.
 *
 * @param job the job context describing the inputs
 * @return the wrapped splits of all inputs
 * @throws IOException if an InputFormat fails to compute its splits
 * @throws InterruptedException if an InputFormat is interrupted
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> wrappedSplits = Lists.newArrayList();
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap =
            CrunchInputs.getFormatNodeMap(jobCopy);

    for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatEntry
            : formatNodeMap.entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        Map<Integer, List<Path>> pathsByNode = formatEntry.getValue();

        for (Map.Entry<Integer, List<Path>> nodeEntry : pathsByNode.entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> nodePaths = nodeEntry.getValue();
            Path[] inputPaths = nodePaths.toArray(new Path[nodePaths.size()]);
            FileInputFormat.setInputPaths(jobCopy, inputPaths);

            // Wrap each split with its InputFormat class and node index.
            for (InputSplit pathSplit : format.getSplits(jobCopy)) {
                wrappedSplits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf));
            }
        }
    }
    return wrappedSplits;
}

From source file:com.datasalt.pangool.tuplemr.mapred.lib.input.DelegatingInputFormat.java

License:Apache License

/**
 * Computes the splits of every input path registered through
 * {@code PangoolMultipleInputs}.
 *
 * <p>For each path, the path is set as the only input of a job copy, the
 * associated InputFormat instance is loaded through
 * {@code InstancesDistributor}, the input-specific context is applied to the
 * job copy's configuration, and each split produced is wrapped in a
 * {@code TaggedInputSplit} along with the values registered for that path in
 * the format map and the processor map.
 *
 * @param job the job context describing the inputs
 * @return the tagged splits of all input paths
 * @throws IOException if an InputFormat fails to load or to compute its splits
 * @throws InterruptedException if an InputFormat is interrupted
 */
@SuppressWarnings("unchecked")
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    List<InputSplit> taggedSplits = new ArrayList<InputSplit>();

    Map<Path, String> formatMap = PangoolMultipleInputs.getInputFormatMap(job);
    Map<Path, String> mapperMap = PangoolMultipleInputs.getInputProcessorFileMap(job);

    for (Map.Entry<Path, String> formatEntry : formatMap.entrySet()) {
        Path inputPath = formatEntry.getKey();
        String formatRef = formatEntry.getValue();

        FileInputFormat.setInputPaths(jobCopy, inputPath);
        InputFormat inputFormat = InstancesDistributor.loadInstance(conf, InputFormat.class, formatRef, true);
        PangoolMultipleInputs.setSpecificInputContext(jobCopy.getConfiguration(), formatRef);

        String processorRef = mapperMap.get(inputPath);
        for (InputSplit pathSplit : inputFormat.getSplits(jobCopy)) {
            taggedSplits.add(new TaggedInputSplit(pathSplit, conf, formatRef, processorRef));
        }
    }

    return taggedSplits;
}

From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java

License:Apache License

/**
 * Returns the total size of the job's map input in megabytes.
 *
 * <p>The size is the sum of the lengths of all splits produced by the job's
 * InputFormat, divided by 1024 twice.
 *
 * @return the total map input size in MB
 * @throws JobException if the job has not been set
 * @throws IllegalArgumentException if the splits add up to zero bytes
 * @throws ClassNotFoundException if the job's InputFormat class cannot be loaded
 * @throws IOException if the InputFormat fails to compute its splits
 * @throws InterruptedException if the InputFormat is interrupted
 */
protected double getTotalMapInputMB()
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    long totalBytes = 0;
    for (InputSplit split : inputFormat.getSplits(job)) {
        totalBytes += split.getLength();
    }

    // An input of zero bytes is treated as a misconfiguration rather than a valid result.
    if (totalBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    return (double) totalBytes / 1024 / 1024;
}

From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java

License:Apache License

/**
 * Returns the number of splits the job's InputFormat produces for the job.
 *
 * @return the number of map input splits
 * @throws JobException if the job has not been set
 * @throws ClassNotFoundException if the job's InputFormat class cannot be loaded
 * @throws IOException if the InputFormat fails to compute its splits
 * @throws InterruptedException if the InputFormat is interrupted
 */
protected int getMapInputSplitCount()
        throws ClassNotFoundException, JobException, IOException, InterruptedException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    InputFormat<?, ?> inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    int splitCount = inputFormat.getSplits(job).size();
    return splitCount;
}

From source file:com.linkedin.cubert.io.CubertInputFormat.java

License:Open Source License

/**
 * Computes the splits of every multi-mapper configured for the job.
 *
 * <p>For each mapper index reported by {@code ConfigurationDiff}, that
 * mapper's diff is applied to the job configuration, the splits are computed
 * (through a {@code CombineFileInputFormat} when
 * {@code CubertStrings.COMBINED_INPUT} is set, otherwise through the actual
 * InputFormat), each split is wrapped in a {@code MultiMapperSplit} carrying
 * the mapper index, and the diff is undone again. The diff is undone even when
 * split computation fails, so the configuration is not left modified.
 *
 * @param context the job context describing the inputs
 * @return the wrapped splits of all multi-mappers
 * @throws IOException if an InputFormat fails to compute its splits
 * @throws InterruptedException if an InputFormat is interrupted
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    ConfigurationDiff confDiff = new ConfigurationDiff(conf);

    int numMultiMappers = confDiff.getNumDiffs();

    List<InputSplit> splits = new ArrayList<InputSplit>();

    for (int mapperIndex = 0; mapperIndex < numMultiMappers; mapperIndex++) {
        // reset conf to multimapper i
        confDiff.applyDiff(mapperIndex);
        try {
            // get the actual input format
            InputFormat<K, V> actualInputFormat = getActualInputFormat(context);

            List<InputSplit> actualSplits = null;

            // check if combined input split is requested
            boolean combineSplit = conf.getBoolean(CubertStrings.COMBINED_INPUT, false);

            if (combineSplit) {
                // Create a CombineFileInputFormat that is used for split computation only;
                // its record reader is never expected to be requested.
                CombineFileInputFormat<K, V> cfif = new CombineFileInputFormat<K, V>() {
                    @Override
                    public RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
                            throws IOException {
                        throw new IllegalStateException("Should not be called");
                    }
                };

                // get the splits
                actualSplits = cfif.getSplits(context);
            } else {
                actualSplits = actualInputFormat.getSplits(context);
            }

            // embed each split in MultiMapperSplit and add to list
            for (InputSplit actualSplit : actualSplits) {
                splits.add(new MultiMapperSplit(actualSplit, mapperIndex));
            }
        } finally {
            // undo the diff, also on failure, so the configuration is restored
            confDiff.undoDiff(mapperIndex);
        }
    }
    return splits;
}