Example usage for org.apache.hadoop.mapred FileInputFormat getSplits

Introduction

On this page you can find example usage for org.apache.hadoop.mapred FileInputFormat getSplits.

Prototype

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException 

Document

Splits files returned by #listStatus(JobConf) when they're too big.
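
A minimal, self-contained sketch of a typical getSplits call with the old mapred API (not taken from the examples below); the input path and the requested split count of 4 are illustrative values, and TextInputFormat stands in for any concrete FileInputFormat.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetSplitsSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();

        // Point the input format at the data to be split (illustrative path).
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

        // TextInputFormat is JobConfigurable, so configure it before use.
        TextInputFormat inputFormat = new TextInputFormat();
        inputFormat.configure(job);

        // Request roughly 4 splits; numSplits is only a hint, and the actual
        // number depends on file sizes and block boundaries.
        InputSplit[] splits = inputFormat.getSplits(job, 4);

        for (InputSplit split : splits) {
            System.out.println(split);
        }
    }
}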

Usage

From source file: com.hdfs.concat.crush.CrushReducer.java

License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList
            .get(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        // One split is requested because a single file path was set above; the
        // length check below rejects anything other than exactly one split.
        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file: com.m6d.filecrush.crush.CrushReducer.java

License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file: gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java

License: Apache License

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }

    if (state.contains(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
        for (String inputPath : state.getPropAsList(HadoopFileInputSource.FILE_INPUT_PATHS_KEY)) {
            FileInputFormat.addInputPath(jobConf, new Path(inputPath));
        }
    }

    try {
        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, jobConf);
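        // The desired split count comes from configuration (with a default); it
        // is only a hint to getSplits, so the returned number of splits may differ.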
        InputSplit[] fileSplits = fileInputFormat.getSplits(jobConf,
                state.getPropAsInt(HadoopFileInputSource.FILE_SPLITS_DESIRED_KEY,
                        HadoopFileInputSource.DEFAULT_FILE_SPLITS_DESIRED));
        if (fileSplits == null || fileSplits.length == 0) {
            return ImmutableList.of();
        }

        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType
                        .valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.length);
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY,
                    HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }

        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}

From source file: org.pooledtimeseries.cartesian.CartesianInputFormat.java

License: Apache License

private InputSplit[] getInputSplits(JobConf conf, String inputFormatClass, String inputPath, int numSplits)
        throws ClassNotFoundException, IOException {
    // Create a new instance of the input format
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);

    // Set the input path for the left data set
    inputFormat.setInputPaths(conf, inputPath);

    // Get the left input splits
    return inputFormat.getSplits(conf, numSplits);
}

From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java

License: Apache License

private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
        throws IOException {
    List<String> found = new ArrayList<String>();
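    // Ask for n splits (a hint), then read every record from each split and
    // collect the keys that were produced.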
    for (InputSplit split : format.getSplits(job, n)) {
        RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
        Text key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                found.add(key.toString());
            }
        } finally {
            reader.close();
        }
    }
    return found;
}