Example usage for org.apache.hadoop.mapred FileInputFormat getSplits

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat.getSplits.

Prototype

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException

Source Link

Document

Splits the files returned by listStatus(JobConf) when they are too big.

Usage

From source file:com.hdfs.concat.crush.CrushReducer.java

License:Apache License

/**
 * Opens a record reader over a single input file, using the input format class stored at
 * position {@code idx} of {@code inFormatClsList}.
 *
 * <p>The job's input paths are set to {@code inputPath}, the input format is instantiated
 * reflectively (and handed the job when it implements {@link JobConfigurable}), and splits are
 * requested with a split count of 1. Anything other than exactly one returned split is rejected.
 *
 * @param idx       index of the input format class in {@code inFormatClsList}
 * @param inputPath file to open
 * @param reporter  passed through to {@code getRecordReader}
 * @return a reader over the only split produced for {@code inputPath}
 * @throws IOException              rethrown unchanged when raised while preparing the reader
 * @throws IllegalArgumentException if the input format does not return exactly one split
 * @throws RuntimeException         wrapping any other checked exception (for example a failed
 *                                  reflective instantiation)
 */
@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    final Class<? extends FileInputFormat<?, ?>> formatClass =
            (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        final FileInputFormat<?, ?> inputFormat = formatClass.newInstance();
        if (inputFormat instanceof JobConfigurable) {
            final JobConfigurable configurable = (JobConfigurable) inputFormat;
            configurable.configure(job);
        }

        final InputSplit[] candidates = inputFormat.getSplits(job, 1);
        if (candidates.length != 1) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) inputFormat.getRecordReader(candidates[0], job, reporter);
    } catch (Exception e) {
        // Unchecked and I/O failures pass through untouched; everything else is wrapped.
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        }
        if (e instanceof IOException) {
            throw (IOException) e;
        }
        throw new RuntimeException(e);
    }
}

From source file:com.m6d.filecrush.crush.CrushReducer.java

License:Apache License

/**
 * Opens a record reader over a single input file, using the input format class that
 * {@code getInputFormatClass(idx)} returns.
 *
 * <p>The job's input paths are set to {@code inputPath}, the input format is instantiated
 * reflectively (and handed the job when it implements {@link JobConfigurable}), and splits are
 * requested with a split count of 1. Anything other than exactly one returned split is rejected.
 *
 * @param idx       index passed to {@code getInputFormatClass}
 * @param inputPath file to open
 * @param reporter  passed through to {@code getRecordReader}
 * @return a reader over the only split produced for {@code inputPath}
 * @throws IOException              rethrown unchanged when raised while preparing the reader
 * @throws IllegalArgumentException if the input format does not return exactly one split
 * @throws RuntimeException         wrapping any other checked exception (for example a failed
 *                                  reflective instantiation)
 */
@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    final Class<? extends FileInputFormat<?, ?>> formatClass = getInputFormatClass(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        final FileInputFormat<?, ?> inputFormat = formatClass.newInstance();
        if (inputFormat instanceof JobConfigurable) {
            final JobConfigurable configurable = (JobConfigurable) inputFormat;
            configurable.configure(job);
        }

        final InputSplit[] candidates = inputFormat.getSplits(job, 1);
        if (candidates.length != 1) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) inputFormat.getRecordReader(candidates[0], job, reporter);
    } catch (Exception e) {
        // Unchecked and I/O failures pass through untouched; everything else is wrapped.
        if (e instanceof RuntimeException) {
            throw (RuntimeException) e;
        }
        if (e instanceof IOException) {
            throw (IOException) e;
        }
        throw new RuntimeException(e);
    }
}

From source file:gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java

License:Apache License

/**
 * Builds one {@link WorkUnit} per input split of the configured input files.
 *
 * <p>Every property of {@code state} is copied into a fresh {@link JobConf}; if
 * {@code FILE_INPUT_PATHS_KEY} is present, each listed path is added as an input path. The
 * input format obtained from {@code getFileInputFormat} is then asked for splits, using
 * {@code FILE_SPLITS_DESIRED_KEY} (or its default) as the desired split count. Each split is
 * stored on its work unit in serialized form together with the split's file path.
 *
 * @param state source state supplying the Hadoop configuration and extract settings
 * @return one work unit per split, or an empty list when no splits are produced
 * @throws RuntimeException wrapping any {@link IOException} raised while computing or
 *                          serializing the splits
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }

    if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
        for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
            FileInputFormat.addInputPath(jobConf, new Path(inputPath));
        }
    }

    try {
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
        InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf,
                state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY,
                        HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
        if (fileSplits == null || fileSplits.length == 0) {
            return ImmutableList.of();
        }

        // Upper-case with Locale.ROOT: the default-locale variant breaks the enum lookup in
        // locales with special casing rules (e.g. Turkish maps 'i' to a dotted capital I).
        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType.valueOf(
                        state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase(java.util.Locale.ROOT))
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit; splits are expected to be FileSplits.
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY,
                    HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }

        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}

From source file:org.pooledtimeseries.cartesian.CartesianInputFormat.java

License:Apache License

/**
 * Instantiates the named input format and returns the splits it computes for {@code inputPath}.
 *
 * <p>Note that this sets the input paths on {@code conf} as a side effect.
 *
 * @param conf             job configuration used to create the input format and compute splits
 * @param inputFormatClass fully qualified class name of a {@code FileInputFormat} implementation
 * @param inputPath        input path string handed to {@code FileInputFormat.setInputPaths}
 * @param numSplits        split-count argument forwarded to {@code getSplits}
 * @return the splits reported by the input format
 * @throws ClassNotFoundException if {@code inputFormatClass} cannot be loaded
 * @throws IOException            if computing the splits fails
 */
private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits)
        throws ClassNotFoundException, IOException {
    Class<?> formatClass = Class.forName(inputFormatClass);
    FileInputFormat<?, ?> format = (FileInputFormat<?, ?>) ReflectionUtils.newInstance(formatClass, conf);

    // setInputPaths is a static method, so it is invoked on the class rather than the instance.
    FileInputFormat.setInputPaths(conf, inputPath);

    return format.getSplits(conf, numSplits);
}

From source file:org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java

License:Apache License

/**
 * Reads every record of every split produced by {@code format} and collects the keys as strings.
 *
 * @param format   input format under test
 * @param job      job configuration passed to {@code getSplits} and {@code getRecordReader}
 * @param n        split-count argument forwarded to {@code getSplits}
 * @param reporter passed through to {@code getRecordReader}
 * @return the string form of each key, in the order the splits and records were read
 * @throws IOException if computing splits, reading records, or closing a reader fails
 */
private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
        throws IOException {
    List<String> found = new ArrayList<String>();
    for (InputSplit split : format.getSplits(job, n)) {
        RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
        try {
            // Key/value holders are created inside the try so the reader is closed even if
            // their creation fails.
            Text key = reader.createKey();
            Text value = reader.createValue();
            while (reader.next(key, value)) {
                found.add(key.toString());
            }
        } finally {
            reader.close();
        }
    }
    return found;
}