Example usage for org.apache.hadoop.mapred JobConf getInputFormat

Introduction

On this page you can find example usages of org.apache.hadoop.mapred JobConf getInputFormat.

Prototype

public InputFormat getInputFormat() 

Document

Get the InputFormat implementation for the map-reduce job, defaulting to TextInputFormat if not specified explicitly.
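As a minimal, self-contained sketch of the call (the input path below is hypothetical and used only for illustration): a JobConf is configured, getInputFormat() returns the configured InputFormat (or a TextInputFormat instance if none was set), and the returned format is then asked for its splits, just as the real-world examples further down do.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetInputFormatExample {
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf();

        // Optional: without this call, getInputFormat() falls back to TextInputFormat.
        jobConf.setInputFormat(TextInputFormat.class);

        // Hypothetical input path, for illustration only.
        FileInputFormat.setInputPaths(jobConf, new Path("/tmp/input"));

        InputFormat<?, ?> format = jobConf.getInputFormat();

        // Ask the returned format for its splits; the second argument is a split-count hint.
        InputSplit[] splits = format.getSplits(jobConf, 1);

        System.out.println(format.getClass().getName() + " produced " + splits.length + " splits");
    }
}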

Usage

From source file: org.apache.ignite.internal.processors.hadoop.v1.GridHadoopV1Splitter.java

License: Apache License

/**
 * @param jobConf Job configuration.
 * @return Collection of mapped splits.
 * @throws IgniteCheckedException If mapping failed.
 */
public static Collection<GridHadoopInputSplit> splitJob(JobConf jobConf) throws IgniteCheckedException {
    try {
        InputFormat<?, ?> format = jobConf.getInputFormat();

        assert format != null;

        InputSplit[] splits = format.getSplits(jobConf, 0);

        Collection<GridHadoopInputSplit> res = new ArrayList<>(splits.length);

        for (int i = 0; i < splits.length; i++) {
            InputSplit nativeSplit = splits[i];

            if (nativeSplit instanceof FileSplit) {
                FileSplit s = (FileSplit) nativeSplit;

                res.add(new GridHadoopFileBlock(s.getLocations(), s.getPath().toUri(), s.getStart(),
                        s.getLength()));
            } else
                res.add(GridHadoopUtils.wrapSplit(i, nativeSplit, nativeSplit.getLocations()));
        }

        return res;
    } catch (IOException e) {
        throw new IgniteCheckedException(e);
    }
}

From source file: org.apache.ignite.internal.processors.hadoop.v1.HadoopV1MapTask.java

License: Apache License

/** {@inheritDoc} */
@SuppressWarnings("unchecked")
@Override
public void run(HadoopTaskContext taskCtx) throws IgniteCheckedException {
    HadoopJob job = taskCtx.job();

    HadoopV2TaskContext ctx = (HadoopV2TaskContext) taskCtx;

    JobConf jobConf = ctx.jobConf();

    InputFormat inFormat = jobConf.getInputFormat();

    HadoopInputSplit split = info().inputSplit();

    InputSplit nativeSplit;

    if (split instanceof HadoopFileBlock) {
        HadoopFileBlock block = (HadoopFileBlock) split;

        nativeSplit = new FileSplit(new Path(block.file().toString()), block.start(), block.length(),
                EMPTY_HOSTS);
    } else
        nativeSplit = (InputSplit) ctx.getNativeSplit(split);

    assert nativeSplit != null;

    Reporter reporter = new HadoopV1Reporter(taskCtx);

    HadoopV1OutputCollector collector = null;

    try {
        collector = collector(jobConf, ctx, !job.info().hasCombiner() && !job.info().hasReducer(), fileName(),
                ctx.attemptId());

        RecordReader reader = inFormat.getRecordReader(nativeSplit, jobConf, reporter);

        Mapper mapper = ReflectionUtils.newInstance(jobConf.getMapperClass(), jobConf);

        Object key = reader.createKey();
        Object val = reader.createValue();

        assert mapper != null;

        try {
            try {
                while (reader.next(key, val)) {
                    if (isCancelled())
                        throw new HadoopTaskCancelledException("Map task cancelled.");

                    mapper.map(key, val, collector, reporter);
                }
            } finally {
                mapper.close();
            }
        } finally {
            collector.closeWriter();
        }

        collector.commit();
    } catch (Exception e) {
        if (collector != null)
            collector.abort();

        throw new IgniteCheckedException(e);
    }
}

From source file: org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License: Apache License

@Override
protected void runJob(JobConf job) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    firstOutput = new PartialOutputCollector(numTrees);
    Reporter reporter = Reporter.NULL;

    firstIds = new int[splits.length];
    sizes = new int[splits.length];

    // to compute firstIds, process the splits in file order
    int firstId = 0;
    long slowest = 0; // duration of slowest map
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, splits.length,
                numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.next(key, value)) {
            mapper.map(key, value, firstOutput, reporter);
            firstId++;
            sizes[hp]++;
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}

From source file: org.apache.mahout.df.mapred.partial.PartialSequentialBuilder.java

License: Apache License

/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition.
 *
 * @throws IOException if an I/O error occurs
 */
void secondStep(JobConf job, Path forestPath, PredictionCallback callback) throws IOException {
    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, job.getNumMapTasks());
    log.debug("Nb splits : {}", splits.length);

    Builder.sortSplits(splits);

    int numTrees = Builder.getNbTrees(job); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < splits.length; p++) {
        total += Step2Mapper.nbConcerned(splits.length, numTrees, p);
    }

    secondOutput = new PartialOutputCollector(total);
    Reporter reporter = Reporter.NULL;
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < splits.length; partition++) {
        InputSplit split = splits[partition];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(splits.length, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(job);
        int numInstances = InterResults.load(fs, forestPath, splits.length, numTrees, partition, fsKeys,
                fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.next(key, value)) {
            mapper.map(key, value, secondOutput, reporter);
        }

        mapper.close();

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}

From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java

License: Apache License

public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);

    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Step0OutputCollector collector = new Step0OutputCollector(numMaps);
    Reporter reporter = Reporter.NULL;

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, value, collector, reporter);

            size++;
        }

        mapper.close();

        // validate the mapper's output
        assertEquals(p, collector.keys[p]);
        assertEquals(firstKey.longValue(), collector.values[p].getFirstId());
        assertEquals(size, collector.values[p].getSize());
    }

}

From source file: org.apache.mahout.df.mapred.partial.Step0JobTest.java

License: Apache License

public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, numAttributes);
    double[][] source = Utils.randomDoubles(rng, descriptor, numInstances);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < numInstances; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    JobConf job = new JobConf();
    job.setNumMapTasks(numMaps);
    FileInputFormat.setInputPaths(job, dataPath);

    // retrieve the splits
    TextInputFormat input = (TextInputFormat) job.getInputFormat();
    InputSplit[] splits = input.getSplits(job, numMaps);

    InputSplit[] sorted = Arrays.copyOf(splits, splits.length);
    Builder.sortSplits(sorted);

    Reporter reporter = Reporter.NULL;

    int[] keys = new int[numMaps];
    Step0Output[] values = new Step0Output[numMaps];

    int[] expectedIds = new int[numMaps];

    for (int p = 0; p < numMaps; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.getRecordReader(split, job, reporter);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();

        Long firstKey = null;
        int size = 0;

        while (reader.next(key, value)) {
            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).label;
            }

            size++;
        }

        keys[p] = p;
        values[p] = new Step0Output(firstKey, size);
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}

From source file: org.apache.reef.io.data.loading.impl.AbstractEvaluatorToPartitionStrategy.java

License: Apache License

@SuppressWarnings("rawtypes")
AbstractEvaluatorToPartitionStrategy(final String inputFormatClassName,
        final Set<String> serializedDataPartitions) {
    LOG.fine("AbstractEvaluatorToPartitionStrategy injected");
    Validate.notEmpty(inputFormatClassName);
    Validate.notEmpty(serializedDataPartitions);

    locationToSplits = new ConcurrentHashMap<>();
    evaluatorToSplits = new ConcurrentHashMap<>();
    unallocatedSplits = new LinkedBlockingQueue<>();
    setUp();

    final Map<DistributedDataSetPartition, InputSplit[]> splitsPerPartition = new HashMap<>();
    for (final String serializedDataPartition : serializedDataPartitions) {
        final DistributedDataSetPartition dp = DistributedDataSetPartitionSerializer
                .deserialize(serializedDataPartition);
        final ExternalConstructor<JobConf> jobConfExternalConstructor = new JobConfExternalConstructor(
                inputFormatClassName, dp.getPath());
        try {
            final JobConf jobConf = jobConfExternalConstructor.newInstance();
            final InputFormat inputFormat = jobConf.getInputFormat();
            final InputSplit[] inputSplits = inputFormat.getSplits(jobConf, dp.getDesiredSplits());
            if (LOG.isLoggable(Level.FINEST)) {
                LOG.log(Level.FINEST, "Splits for partition: {0} {1}",
                        new Object[] { dp, Arrays.toString(inputSplits) });
            }
            this.totalNumberOfSplits += inputSplits.length;
            splitsPerPartition.put(dp, inputSplits);
        } catch (final IOException e) {
            throw new RuntimeException("Unable to get InputSplits using the specified InputFormat", e);
        }
    }
    init(splitsPerPartition);
    LOG.log(Level.FINE, "Total Number of splits: {0}", this.totalNumberOfSplits);
}

From source file: org.apache.reef.io.data.loading.impl.InputFormatExternalConstructor.java

License: Apache License

@Inject
public InputFormatExternalConstructor(final JobConf jobConf) {
    this.jobConf = jobConf;
    inputFormat = jobConf.getInputFormat();
}

From source file: org.apache.tez.mapreduce.hadoop.MRInputHelpers.java

License: Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
private static org.apache.hadoop.mapred.InputSplit[] generateOldSplits(JobConf jobConf, boolean groupSplits,
        int numTasks) throws IOException {

    // This is the real InputFormat
    org.apache.hadoop.mapred.InputFormat inputFormat;
    try {
        inputFormat = jobConf.getInputFormat();
    } catch (Exception e) {
        throw new TezUncheckedException(e);
    }

    org.apache.hadoop.mapred.InputFormat finalInputFormat = inputFormat;

    if (groupSplits) {
        org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat groupedFormat = new org.apache.hadoop.mapred.split.TezGroupedSplitsInputFormat();
        groupedFormat.setConf(jobConf);
        groupedFormat.setInputFormat(inputFormat);
        groupedFormat.setDesiredNumberOfSplits(numTasks);
        finalInputFormat = groupedFormat;
    } else {
        finalInputFormat = inputFormat;
    }
    org.apache.hadoop.mapred.InputSplit[] splits = finalInputFormat.getSplits(jobConf,
            jobConf.getNumMapTasks());
    // sort the splits into order based on size, so that the biggest
    // go first
    Arrays.sort(splits, new OldInputSplitComparator());
    return splits;
}

From source file: org.deeplearning4j.iterativereduce.runtime.yarn.appmaster.ApplicationMaster.java

License: Apache License

private Set<ConfigurationTuple> getConfigurationTuples() throws IOException {
    if (confTuples != null)
        return confTuples;
    Path inputPath = new Path(props.getProperty(ConfigFields.APP_INPUT_PATH));
    FileSystem fs = FileSystem.get(conf);
    FileStatus f = fs.getFileStatus(inputPath);
    //BlockLocation[] bl = fs.getFileBlockLocations(p, 0, f.getLen());
    Set<ConfigurationTuple> configTuples = new HashSet<>();
    int workerId = 0;

    JobConf job = new JobConf(new Configuration());

    job.setInputFormat((Class<? extends InputFormat>) this.inputFormatClass); //TextInputFormat.class);

    FileInputFormat.setInputPaths(job, inputPath);

    InputSplit[] splits = job.getInputFormat().getSplits(job, job.getNumMapTasks());

    for (InputSplit split : splits) {

        FileSplit convertedToMetronomeSplit = new FileSplit();

        org.apache.hadoop.mapred.FileSplit hadoopFileSplit = (org.apache.hadoop.mapred.FileSplit) split;

        if (hadoopFileSplit.getLength() - hadoopFileSplit.getStart() > 0) {
            convertedToMetronomeSplit.setLength(hadoopFileSplit.getLength());
            convertedToMetronomeSplit.setOffset(hadoopFileSplit.getStart());
            convertedToMetronomeSplit.setPath(hadoopFileSplit.getPath().toString());

            StartupConfiguration config = StartupConfiguration.newBuilder().setBatchSize(batchSize)
                    .setIterations(iterationCount).setOther(appConfig).setSplit(convertedToMetronomeSplit)
                    .build();

            String wid = "worker-" + workerId;
            ConfigurationTuple tuple = new ConfigurationTuple(split.getLocations()[0], wid, config);

            configTuples.add(tuple);
            workerId++;

            LOG.info("IR_AM_worker: " + wid + " added split: " + convertedToMetronomeSplit.toString());

        } else {
            LOG.info("IR_AM: Culled out 0 length Split: " + convertedToMetronomeSplit.toString());
        }

    }

    LOG.info("Total Splits/Workers: " + configTuples.size());

    confTuples = configTuples;
    return configTuples;
}