Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce JobContext getConfiguration.

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
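
All of the examples below follow the same pattern: obtain the job-wide Configuration from the JobContext and read settings from it. The minimal sketch below shows that call in isolation; the MyInputFormat class and the my.example.verbose property are hypothetical and serve only to illustrate how getConfiguration() is typically consumed inside an InputFormat.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MyInputFormat extends TextInputFormat {

    @Override
    public List<InputSplit> getSplits(JobContext job) throws IOException {
        // Obtain the job-wide Configuration from the JobContext.
        Configuration conf = job.getConfiguration();

        // Read a hypothetical boolean property, falling back to a default.
        boolean verbose = conf.getBoolean("my.example.verbose", false);
        if (verbose) {
            System.out.println("Computing input splits for job: " + conf.get("mapreduce.job.name"));
        }
        return super.getSplits(job);
    }
}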

Usage

From source file:it.crs4.features.BioImgInputFormat.java

License:Apache License

public static String getMetadataFile(JobContext job) {
    return job.getConfiguration().get(META_FN);
}

From source file:it.crs4.pydoop.mapreduce.pipes.PipesNonJavaInputFormat.java

License:Apache License

public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {

    Configuration conf = context.getConfiguration();
    return ReflectionUtils
            .newInstance(conf.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class), conf)
            .getSplits(context);
}

From source file:it.crs4.pydoop.mapreduce.TextInputFormat.java

License:Apache License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    return context.getConfiguration().getBoolean("pydoop.input.issplitable", true);
}

From source file:it.crs4.seal.tsv_sort.TextSampler.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat the input format to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);

    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}

From source file:it.prz.jmatrw4spark.JMATFileInputFormat.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    //It generates the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    for (FileStatus file : files) {
        Path filePath = file.getPath();

        //Calculates the content (array of double) length in bytes.
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());
        FSDataInputStream dis = fs.open(filePath);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();

        long length = _matdata.dataNumOfItems * MLDataType.miDOUBLE.bytes; //Content length.
        long lContentByteOffset = dis.getPos();

        _matReader.close();
        _matReader = null;
        dis = null;

        //Zero bytes, empty file split.
        if (length <= 0) {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        }

        //Split the data.
        if (length > 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }

            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;

                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(
                            makeSplit(filePath, lBlockByteStart, splitSize, blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                } //EndWhile.

                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    } //EndFor.

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:it.prz.jmatrw4spark.JMATFileInputFormat.java

License:Open Source License

@Override
protected boolean isSplitable(JobContext context, Path filename) {
    //A MAT file can compress its individual variables. When variables are
    //compressed, they cannot be split in Apache Spark, so this piece of
    //code determines whether the file is compressed and hence whether it
    //is splittable.
    try {
        FileSystem fs = filename.getFileSystem(context.getConfiguration());
        FSDataInputStream dis = fs.open(filename);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();

        _matReader.close();
        _matReader = null;
        dis = null;

        return !_matdata.sysIsCompressed();
    } catch (IOException ex) {
        return false;
    }
}

From source file:jadoop.util.SingleRecordSplitSequenceFileInputFormat.java

License:Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);

    for (FileStatus file : files) {
        Path seqFilePath = file.getPath();

        SequenceFile.Reader reader = new SequenceFile.Reader(job.getConfiguration(), Reader.file(seqFilePath));

        Text key = new Text();
        TextArrayWritable val = new TextArrayWritable();

        long prevPos = 0;
        while (reader.next(key, val)) {
            long curPos = reader.getPosition();
            FileSplit split = new FileSplit(seqFilePath, prevPos, curPos - prevPos, null);
            splits.add(split);
            prevPos = curPos;
        }

        reader.close();
    }

    return splits;
}

From source file:kafka.bridge.hadoop.KafkaOutputFormat.java

License:Apache License

public static Path getOutputPath(JobContext job) {
    String name = job.getConfiguration().get(KafkaOutputFormat.KAFKA_URL);
    return name == null ? null : new Path(name);
}

From source file:kogiri.common.hadoop.io.format.fasta.FastaRawReadInputFormat.java

License:Open Source License

@Override
protected boolean isSplitable(JobContext context, Path filename) {
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    return codec == null;
}

From source file:kogiri.common.hadoop.io.format.fasta.FastaReadDescriptionInputFormat.java

License:Open Source License

@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaReadDescriptionInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);
    if (!splitable) {
        return false;
    }

    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    if (codec != null) {
        return false;
    }

    return true;
}