Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits

List of usage examples for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits


On this page you can find example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits.


public List<InputSplit> getSplits(JobContext job) throws IOException 

Source Link


Generate the list of files and make them into FileSplits.


From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat.java

License:Open Source License

private List<InputSplit> getInputSplits(JobContext jobContext, String inputFormatClass, Path path)
        throws ClassNotFoundException, IOException {
    Configuration conf = jobContext.getConfiguration();
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);//  w  w  w  . ja va 2s.c o m

    // Set the input path for the left data set
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    String dirs = conf.get(INPUT_DIR);
    conf.set(INPUT_DIR, dirStr);
    return inputFormat.getSplits(jobContext);

From source file:com.asakusafw.testdriver.file.FileInputFormatDriver.java

License:Apache License

/**
 * Creates a new instance.
 * @param context target context with source information
 * @param definition the data model definition
 * @param format the input format
 * @throws IOException if failed to initialize
 * @throws IllegalArgumentException if some parameters were {@code null}
 */
FileInputFormatDriver(DataModelDefinition<V> definition, TaskAttemptContext context,
        FileInputFormat<?, V> format) throws IOException {
    // NOTE(review): the closing braces of the three null checks and of the constructor itself
    // are absent from this listing - presumably lost in extraction; confirm against upstream.

    // Reject null arguments with IllegalArgumentException before any field is assigned.
    if (definition == null) {
        throw new IllegalArgumentException("definition must not be null"); //$NON-NLS-1$
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    if (format == null) {
        throw new IllegalArgumentException("format must not be null"); //$NON-NLS-1$
    LOG.debug("Emulating InputFormat: {}", format.getClass().getName());
    this.definition = definition;
    this.context = context;
    this.format = format;

    // Splits are computed eagerly in the constructor and copied into a LinkedList held by this
    // driver; an IOException raised by getSplits propagates to the caller.
    LOG.debug("Computing input splits: {}", format.getClass().getName());
    this.splits = new LinkedList<>(format.getSplits(context));

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * Go through each original inputsplit, get its file path, and check the
 *  index file,
 * a)  keep it, when there is no index prebuilt on this file
 *  (or the index file doesn't match the base file's checksum);
 * b)  remove it when no matching value is found in the existing index file;
 * c)  construct new smaller inputsplits using indexed blocks found
 * in the index file;
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // NOTE(review): several closing braces and at least one statement are absent from this
    // listing - presumably lost in extraction; confirm against the upstream source before use.

    // The wrapped ("real") input format and its value class are both named in the job configuration.
    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    // Accumulates the splits that survive index-based filtering; this is the filtered return value.
    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    // Start from the splits the wrapped input format would produce on its own.
    List<InputSplit> splits = realInputFormat.getSplits(job);

    // If this is an indexing job, don't skip any input splits.
    // If this is a searching job without a filter condition, bypass the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; // the bytes to be scanned without indexes.
    // totalBytesNewSplits is not declared in this method - presumably a field; it is reset here
    // and only read again in the final log statement.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        // Every split is cast to FileSplit without a type check.
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        // splitLength is not declared in this method either - presumably a field.
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        // Upper bound for new splits: the larger of this split's length and the configured
        // INDEXED_SPLIT_SIZE, which falls back to "dfs.block.size" and then to 256 MiB.
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        // NOTE(review): the next three lines read as the body of a block comment whose opening
        // and closing delimiters are missing from this listing.
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            // New file: remember it and look up its index once.
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);

        // if no index file, we'll have to read all original input
        // splits
        // NOTE(review): no statement follows this condition before the else - the line that
        // handles the un-indexed case appears to be missing from this listing.
        if (!foundIndexedFile)
        else {
            // for each file we only add once its filtered input splits using index
            // file
            if (firstTime) {
                // LOG.info("first time saw " + path
                // + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));

    long endTime = System.currentTimeMillis();
    // Elapsed time is reported in whole seconds (integer division by 1000).
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size() + ", seconds used: "
            + (endTime - startTime) / 1000);
    // Math.max(1, ...) keeps the ratio's divisor from being zero.
    LOG.info(String.format("total bytes to read before filtering: %s," + " after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;

From source file:gobblin.source.extractor.hadoop.HadoopFileInputSource.java

License:Apache License

/**
 * Builds work units from the input splits of the configured file input format: one
 * {@code WorkUnit} is created for each {@code InputSplit} returned by {@code getSplits}.
 *
 * <p>NOTE(review): closing braces and parts of some statements are absent from this listing -
 * presumably lost in extraction; confirm against the upstream source.
 *
 * @param state source state supplying the input paths and the extract table settings
 * @return an empty list when there are no splits, otherwise the {@code workUnits} list
 * @throws RuntimeException wrapping any {@code IOException} raised while computing the splits
 */
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        // A fresh Job over a default Configuration carries the input paths to the input format.
        Job job = Job.getInstance(new Configuration());

        // Input paths are optional; each configured path is registered on the job.
        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));

        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        // No splits means no work: return an immutable empty list.
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();

        // NOTE(review): the true branch of this conditional is a bare type name, so the expression
        // is incomplete as listed - the rest of it appears to be missing.
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            // The split is stored twice: serialized in full, and as its path in string form.
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            // NOTE(review): nothing in the visible lines adds workUnit to workUnits - verify
            // against the upstream source.

        return workUnits;
    } catch (IOException ioe) {
        // Surface I/O failures as unchecked, keeping the original exception as the cause.
        throw new RuntimeException("Failed to get workunits", ioe);

From source file:it.crs4.seal.tsv_sort.TextSampler.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat The input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    // NOTE(review): closing braces and parts of some statements are absent from this listing -
    // presumably lost in extraction; confirm against the upstream source.
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    // key and value are not referenced again in the visible lines.
    Text key = new Text();
    Text value = new Text();
    // One partition per reduce task.
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    // Sample at most MAX_SLICES_SAMPLED splits, spaced sampleStep apart across the split list.
    // NOTE(review): with an empty split list samples is 0 (assuming MAX_SLICES_SAMPLED is
    // positive) and both divisions below would throw ArithmeticException - confirm callers.
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            records += 1;
            // True once the running record count reaches this sample's cumulative quota.
            // NOTE(review): the body of this condition is missing from the listing.
            if ((i + 1) * recordsPerSample <= records) {
    FileSystem outFs = partFile.getFileSystem(conf);
    // Remove a pre-existing partition file (non-recursive delete) before writing a new one.
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);

    // NOTE(review): the createWriter call below is cut off after Text.class in this listing.
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
    NullWritable nullValue = NullWritable.get();
    // Each partition boundary key is appended as a record with a NullWritable value.
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    // NOTE(review): no close of writer is visible in this listing - verify upstream.

From source file:org.kitesdk.data.spi.filesystem.InputFormatReader.java

License:Apache License

/**
 * Opens this reader by computing the input splits for {@code path}.
 *
 * <p>On success the split iterator is stored, {@code shouldAdvance} is set and the state becomes
 * {@code OPEN}. On failure the state becomes {@code ERROR} and the exception is rethrown wrapped
 * in a dataset exception.
 *
 * <p>NOTE(review): closing braces and the start of the first statement are absent from this
 * listing - presumably lost in extraction; confirm against the upstream source.
 */
public void initialize() {
            // NOTE(review): the line below is the trailing argument of a call whose opening line
            // is missing; judging by the message it is a state precondition check.
            "A reader may not be opened more than once - current state:%s", state);

    try {
        FileInputFormat format = InputFormatUtil.newInputFormatInstance(descriptor);
        Job job = Hadoop.Job.newInstance.invoke(conf);

        FileInputFormat.addInputPath(job, path);
        // attempt to minimize the number of InputSplits
        // (done by setting the maximum split size to the length of the file)
        FileStatus stat = fs.getFileStatus(path);
        FileInputFormat.setMaxInputSplitSize(job, stat.getLen());

        this.splits = format.getSplits(job).iterator();
        this.shouldAdvance = true;
        this.state = ReaderWriterState.OPEN;

    } catch (RuntimeException e) {
        // Record the failure in the reader state before rethrowing with the cause attached.
        this.state = ReaderWriterState.ERROR;
        throw new DatasetOperationException("Cannot calculate splits", e);
    } catch (IOException e) {
        // Same handling for I/O failures, reported through the I/O-specific exception type.
        this.state = ReaderWriterState.ERROR;
        throw new DatasetIOException("Cannot calculate splits", e);