List of usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
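Every example below follows the same pattern: pull the job-wide Configuration out of the JobContext, then read or write properties on it. A minimal sketch of the read side, where my.custom.property and its default value are hypothetical names used only for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class ConfigurationLookup {
    // Hypothetical property key, for illustration only.
    private static final String MY_PROP = "my.custom.property";

    /** Returns the property value from the job's configuration, or a default. */
    public static String getMyProperty(JobContext job) {
        Configuration conf = job.getConfiguration();
        return conf.get(MY_PROP, "default-value");
    }
}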
From source file: it.crs4.features.BioImgInputFormat.java
License: Apache License
public static String getMetadataFile(JobContext job) {
    return job.getConfiguration().get(META_FN);
}
From source file: it.crs4.pydoop.mapreduce.pipes.PipesNonJavaInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    return ReflectionUtils
            .newInstance(conf.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class), conf)
            .getSplits(context);
}
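Two things are worth noting in this delegation pattern: the Configuration obtained from the context names the concrete InputFormat to use (under the Submitter.INPUT_FORMAT key, defaulting to TextInputFormat), and the same Configuration is handed to ReflectionUtils.newInstance so the reflectively created format is configured before its own getSplits(context) runs.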
From source file: it.crs4.pydoop.mapreduce.TextInputFormat.java
License: Apache License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    return context.getConfiguration().getBoolean("pydoop.input.issplitable", true);
}
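Because the flag is read with a default of true, a driver can turn splitting off per job. A minimal sketch, assuming job is an org.apache.hadoop.mapreduce.Job:

// Disable input splitting; the format above reads this exact key.
job.getConfiguration().setBoolean("pydoop.input.issplitable", false);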
From source file: it.crs4.seal.tsv_sort.TextSampler.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat the input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
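The sampling strategy mirrors TeraSort's partition-file setup: read roughly sampleSize keys spread over at most MAX_SLICES_SAMPLED evenly spaced splits, sort them, and write the N-1 boundary keys to partFile as a SequenceFile of Text keys with NullWritable values, which a total-order partitioner can load at task start. Note that getConfiguration() supplies both the tuning knob (SAMPLE_SIZE_CONF) and the FileSystem used to write the partition file.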
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // Generate the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path filePath = file.getPath();

        // Calculate the content (array of double) length in bytes.
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());
        FSDataInputStream dis = fs.open(filePath);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        long length = _matdata.dataNumOfItems * MLDataType.miDOUBLE.bytes; // Content length.
        long lContentByteOffset = dis.getPos();
        _matReader.close();
        _matReader = null;
        dis = null;

        // Zero bytes: empty file split.
        if (length <= 0) {
            // Create an empty hosts array for zero-length files.
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        }

        // Split the data.
        if (length > 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }
            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splittable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    }

    // Save the number of input files for metrics/loadgen.
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
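Two details distinguish this from the stock FileInputFormat.getSplits: splits start at lContentByteOffset (just past the MAT header and variable metadata) rather than at byte 0, and empty files still yield a single zero-length split with an empty hosts array. The SPLIT_SLOP guard (1.1 in Hadoop's FileInputFormat) folds a small trailing remainder into the final split instead of emitting a tiny extra one.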
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    // A MAT-file can compress its individual variables. When variables are
    // compressed they cannot be split on Apache Spark, so this code
    // determines whether the file is compressed and therefore whether it
    // is splittable.
    try {
        FileSystem fs = filename.getFileSystem(context.getConfiguration());
        FSDataInputStream dis = fs.open(filename);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        _matReader.close();
        _matReader = null;
        dis = null;
        return !_matdata.sysIsCompressed();
    } catch (IOException ex) {
        return false;
    }
}
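Returning false from the catch block is a conservative default: if the header cannot be read, the file is treated as unsplittable and processed by a single mapper. The trade-off is cost, since isSplitable here opens the file and parses its header once per file during split computation.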
From source file: jadoop.util.SingleRecordSplitSequenceFileInputFormat.java
License: Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path seqFilePath = file.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(job.getConfiguration(), Reader.file(seqFilePath));
        Text key = new Text();
        TextArrayWritable val = new TextArrayWritable();
        long prevPos = 0;
        while (reader.next(key, val)) {
            long curPos = reader.getPosition();
            FileSplit split = new FileSplit(seqFilePath, prevPos, curPos - prevPos, null);
            splits.add(split);
            prevPos = curPos;
        }
        reader.close();
    }
    return splits;
}
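The effect is one FileSplit per record: getPosition() after each next() call marks the record boundary, and each (prevPos, curPos - prevPos) range becomes its own split, so every record is handed to a separate mapper. Passing null for the hosts array gives up locality hints, meaning each single-record split may be scheduled on any node.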
From source file: kafka.bridge.hadoop.KafkaOutputFormat.java
License: Apache License
public static Path getOutputPath(JobContext job) {
    String name = job.getConfiguration().get(KafkaOutputFormat.KAFKA_URL);
    return name == null ? null : new Path(name);
}
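The driver-side counterpart is to store the URL before submission. A minimal sketch, assuming KafkaOutputFormat.KAFKA_URL is accessible from driver code and that the broker/topic in the URL is a made-up placeholder:

import kafka.bridge.hadoop.KafkaOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class KafkaUrlSetup {
    /** Stores the Kafka URL so KafkaOutputFormat.getOutputPath(job) can recover it. */
    public static Job configure() throws Exception {
        Job job = Job.getInstance(new Configuration(), "kafka-bridge-example");
        job.getConfiguration().set(KafkaOutputFormat.KAFKA_URL, "kafka://broker-host:9092/my-topic");
        return job;
    }
}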
From source file: kogiri.common.hadoop.io.format.fasta.FastaRawReadInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    return codec == null;
}
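Keying splittability off the presence of a compression codec is the standard check: a plain-text FASTA file can be split at arbitrary byte offsets, while a generically compressed stream cannot be decoded from the middle. (Hadoop's own TextInputFormat refines this by additionally allowing SplittableCompressionCodec implementations such as bzip2.)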
From source file: kogiri.common.hadoop.io.format.fasta.FastaReadDescriptionInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaReadDescriptionInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);
    if (!splitable) {
        return false;
    }
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    if (codec != null) {
        return false;
    }
    return true;
}
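This variant layers a user-controlled switch on top of the codec check from the previous example: the static isSplitable(Configuration) overload reads a job-level flag first, and only if splitting is allowed does the codec lookup run, so either check alone can veto splitting.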