Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits

List of usage examples for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat getSplits.

Prototype

public List<InputSplit> getSplits(JobContext job) throws IOException 

Document

Generate the list of files and make them into FileSplits.
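
Before looking at the project examples, here is a minimal, self-contained sketch of how getSplits is typically invoked directly (for example from a driver or a test). It is not taken from any example on this page; the input path /tmp/input and the choice of TextInputFormat are placeholder assumptions.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class GetSplitsExample {
    public static void main(String[] args) throws Exception {
        // Job implements JobContext, so it can be passed to getSplits directly.
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path("/tmp/input")); // placeholder input path

        // TextInputFormat is a concrete FileInputFormat; getSplits generates
        // one or more FileSplits per input file, depending on block and split sizes.
        TextInputFormat inputFormat = new TextInputFormat();
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println(split);
        }
    }
}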

Usage

From source file:ca.uwaterloo.iss4e.hadoop.io.CartesianInputFormat.java

License:Open Source License

private List<InputSplit> getInputSplits(JobContext jobContext, String inputFormatClass, Path path)
        throws ClassNotFoundException, IOException {
    Configuration conf = jobContext.getConfiguration();
    FileInputFormat inputFormat = (FileInputFormat) ReflectionUtils.newInstance(Class.forName(inputFormatClass),
            conf);

    // Set the input path for the left data set
    path = path.getFileSystem(conf).makeQualified(path);
    String dirStr = StringUtils.escapeString(path.toString());
    String dirs = conf.get(INPUT_DIR); // previous value of INPUT_DIR (read but not restored in this snippet)
    conf.set(INPUT_DIR, dirStr);
    return inputFormat.getSplits(jobContext);
}

From source file:com.asakusafw.testdriver.file.FileInputFormatDriver.java

License:Apache License

/**
 * Creates a new instance.
 * @param context target context with source information
 * @param definition the data model definition
 * @param format the input format
 * @throws IOException if initialization fails
 * @throws IllegalArgumentException if any parameter is {@code null}
 */
FileInputFormatDriver(DataModelDefinition<V> definition, TaskAttemptContext context,
        FileInputFormat<?, V> format) throws IOException {
    if (definition == null) {
        throw new IllegalArgumentException("definition must not be null"); //$NON-NLS-1$
    }
    if (context == null) {
        throw new IllegalArgumentException("context must not be null"); //$NON-NLS-1$
    }
    if (format == null) {
        throw new IllegalArgumentException("format must not be null"); //$NON-NLS-1$
    }
    LOG.debug("Emulating InputFormat: {}", format.getClass().getName());
    this.definition = definition;
    this.context = context;
    this.format = format;

    LOG.debug("Computing input splits: {}", format.getClass().getName());
    this.splits = new LinkedList<>(format.getSplits(context));
}

From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java

License:Apache License

/**
 * Go through each original input split, get its file path, and check the
 * index file:
 * a) keep the split when there is no index prebuilt for this file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove the split when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using the indexed blocks found
 *    in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    List<InputSplit> splits = realInputFormat.getSplits(job);

    // for indexing jobs, don't skip any input splits;
    // for a searching job with no search filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; //the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file, add its filtered input splits from the index file only once
            if (firstTime) {
                // LOG.info("first time saw " + path
                // + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size() + ", seconds used: "
            + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s," + " after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}

From source file:gobblin.source.extractor.hadoop.HadoopFileInputSource.java

License:Apache License

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        Job job = Job.getInstance(new Configuration());

        if (state.contains(FILE_INPUT_PATHS_KEY)) {
            for (String inputPath : state.getPropAsList(FILE_INPUT_PATHS_KEY)) {
                FileInputFormat.addInputPath(job, new Path(inputPath));
            }
        }

        FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
        List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
        if (fileSplits == null || fileSplits.isEmpty()) {
            return ImmutableList.of();
        }

        Extract.TableType tableType = state.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)
                ? Extract.TableType
                        .valueOf(state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY).toUpperCase())
                : null;
        String tableNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
        String tableName = state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);

        List<WorkUnit> workUnits = Lists.newArrayListWithCapacity(fileSplits.size());
        for (InputSplit inputSplit : fileSplits) {
            // Create one WorkUnit per InputSplit
            FileSplit fileSplit = (FileSplit) inputSplit;
            Extract extract = createExtract(tableType, tableNamespace, tableName);
            WorkUnit workUnit = WorkUnit.create(extract);
            workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
            workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
            workUnits.add(workUnit);
        }

        return workUnits;
    } catch (IOException ioe) {
        throw new RuntimeException("Failed to get workunits", ioe);
    }
}

From source file:it.crs4.seal.tsv_sort.TextSampler.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default it reads 100,000 keys from 20 locations in the input,
 * sorts them, and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat the input format to sample
 * @param job the job to sample
 * @param partFile where to write the output file
 * @throws IOException if something goes wrong
 * @throws InterruptedException if the sampling is interrupted
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);

    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}

From source file:org.kitesdk.data.spi.filesystem.InputFormatReader.java

License:Apache License

@Override
public void initialize() {
    Preconditions.checkState(ReaderWriterState.NEW.equals(state),
            "A reader may not be opened more than once - current state:%s", state);

    try {
        FileInputFormat format = InputFormatUtil.newInputFormatInstance(descriptor);
        Job job = Hadoop.Job.newInstance.invoke(conf);

        FileInputFormat.addInputPath(job, path);
        // attempt to minimize the number of InputSplits
        FileStatus stat = fs.getFileStatus(path);
        FileInputFormat.setMaxInputSplitSize(job, stat.getLen());

        this.splits = format.getSplits(job).iterator();
        this.shouldAdvance = true;
        this.state = ReaderWriterState.OPEN;

    } catch (RuntimeException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetOperationException("Cannot calculate splits", e);
    } catch (IOException e) {
        this.state = ReaderWriterState.ERROR;
        throw new DatasetIOException("Cannot calculate splits", e);
    }
}