List of usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
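Every example below follows the same pattern: pull the job-wide Configuration out of the JobContext, then read or write properties on it. A minimal sketch of the read side, where my.custom.property and its default value are hypothetical names used only for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class ConfigurationLookup {
    // Hypothetical property key, for illustration only.
    private static final String MY_PROP = "my.custom.property";

    /** Returns the property value from the job's configuration, or a default. */
    public static String getMyProperty(JobContext job) {
        Configuration conf = job.getConfiguration();
        return conf.get(MY_PROP, "default-value");
    }
}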
From source file: it.crs4.features.BioImgInputFormat.java
License: Apache License
public static String getMetadataFile(JobContext job) {
    return job.getConfiguration().get(META_FN);
}
From source file: it.crs4.pydoop.mapreduce.pipes.PipesNonJavaInputFormat.java
License: Apache License
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    Configuration conf = context.getConfiguration();
    return ReflectionUtils
            .newInstance(conf.getClass(Submitter.INPUT_FORMAT, TextInputFormat.class, InputFormat.class), conf)
            .getSplits(context);
}
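Two things are worth noting in this delegation pattern: the Configuration obtained from the context names the concrete InputFormat to use (under the Submitter.INPUT_FORMAT key, defaulting to TextInputFormat), and the same Configuration is handed to ReflectionUtils.newInstance so the reflectively created format is configured before its own getSplits(context) runs.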
From source file: it.crs4.pydoop.mapreduce.TextInputFormat.java
License: Apache License
@Override
protected boolean isSplitable(JobContext context, Path file) {
    return context.getConfiguration().getBoolean("pydoop.input.issplitable", true);
}
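Because the flag is read with a default of true, a driver can turn splitting off per job. A minimal sketch, assuming job is an org.apache.hadoop.mapreduce.Job:

// Disable input splitting; the format above reads this exact key.
job.getConfiguration().setBoolean("pydoop.input.issplitable", false);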
From source file: it.crs4.seal.tsv_sort.TextSampler.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat the input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);
    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}
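The sampling strategy mirrors TeraSort's partition-file setup: read roughly sampleSize keys spread over at most MAX_SLICES_SAMPLED evenly spaced splits, sort them, and write the N-1 boundary keys to partFile as a SequenceFile of Text keys with NullWritable values, which a total-order partitioner can load at task start. Note that getConfiguration() supplies both the tuning knob (SAMPLE_SIZE_CONF) and the FileSystem used to write the partition file.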
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // Generate the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path filePath = file.getPath();

        // Calculate the content (array of double) length in bytes.
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());
        FSDataInputStream dis = fs.open(filePath);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        long length = _matdata.dataNumOfItems * MLDataType.miDOUBLE.bytes; // Content length.
        long lContentByteOffset = dis.getPos();
        _matReader.close();
        _matReader = null;
        dis = null;

        // Zero bytes: empty file split.
        if (length <= 0) {
            // Create an empty hosts array for zero-length files.
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        }

        // Split the data.
        if (length > 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }
            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splittable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    }

    // Save the number of input files for metrics/loadgen.
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
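Two details distinguish this from the stock FileInputFormat.getSplits: splits start at lContentByteOffset (just past the MAT header and variable metadata) rather than at byte 0, and empty files still yield a single zero-length split with an empty hosts array. The SPLIT_SLOP guard (1.1 in Hadoop's FileInputFormat) folds a small trailing remainder into the final split instead of emitting a tiny extra one.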
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    // A MAT-file can compress its individual variables. When variables are
    // compressed they cannot be split on Apache Spark, so this code
    // determines whether the file is compressed and therefore whether it
    // is splittable.
    try {
        FileSystem fs = filename.getFileSystem(context.getConfiguration());
        FSDataInputStream dis = fs.open(filename);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        _matReader.close();
        _matReader = null;
        dis = null;
        return !_matdata.sysIsCompressed();
    } catch (IOException ex) {
        return false;
    }
}
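Returning false from the catch block is a conservative default: if the header cannot be read, the file is treated as unsplittable and processed by a single mapper. The trade-off is cost, since isSplitable here opens the file and parses its header once per file during split computation.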
From source file: jadoop.util.SingleRecordSplitSequenceFileInputFormat.java
License: Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path seqFilePath = file.getPath();
        SequenceFile.Reader reader = new SequenceFile.Reader(job.getConfiguration(), Reader.file(seqFilePath));
        Text key = new Text();
        TextArrayWritable val = new TextArrayWritable();
        long prevPos = 0;
        while (reader.next(key, val)) {
            long curPos = reader.getPosition();
            FileSplit split = new FileSplit(seqFilePath, prevPos, curPos - prevPos, null);
            splits.add(split);
            prevPos = curPos;
        }
        reader.close();
    }
    return splits;
}
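The effect is one FileSplit per record: getPosition() after each next() call marks the record boundary, and each (prevPos, curPos - prevPos) range becomes its own split, so every record is handed to a separate mapper. Passing null for the hosts array gives up locality hints, meaning each single-record split may be scheduled on any node.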
From source file: kafka.bridge.hadoop.KafkaOutputFormat.java
License: Apache License
public static Path getOutputPath(JobContext job) {
    String name = job.getConfiguration().get(KafkaOutputFormat.KAFKA_URL);
    return name == null ? null : new Path(name);
}
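The driver-side counterpart is to store the URL before submission. A minimal sketch, assuming KafkaOutputFormat.KAFKA_URL is accessible from driver code and that the broker/topic in the URL is a made-up placeholder:

import kafka.bridge.hadoop.KafkaOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class KafkaUrlSetup {
    /** Stores the Kafka URL so KafkaOutputFormat.getOutputPath(job) can recover it. */
    public static Job configure() throws Exception {
        Job job = Job.getInstance(new Configuration(), "kafka-bridge-example");
        job.getConfiguration().set(KafkaOutputFormat.KAFKA_URL, "kafka://broker-host:9092/my-topic");
        return job;
    }
}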
From source file: kogiri.common.hadoop.io.format.fasta.FastaRawReadInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    return codec == null;
}
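Keying splittability off the presence of a compression codec is the standard check: a plain-text FASTA file can be split at arbitrary byte offsets, while a generically compressed stream cannot be decoded from the middle. (Hadoop's own TextInputFormat refines this by additionally allowing SplittableCompressionCodec implementations such as bzip2.)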
From source file: kogiri.common.hadoop.io.format.fasta.FastaReadDescriptionInputFormat.java
License: Open Source License
@Override
protected boolean isSplitable(JobContext context, Path filename) {
    boolean splitable = FastaReadDescriptionInputFormat.isSplitable(context.getConfiguration());
    LOG.info("splitable = " + splitable);
    if (!splitable) {
        return false;
    }
    CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(filename);
    if (codec != null) {
        return false;
    }
    return true;
}
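This variant layers a user-controlled switch on top of the codec check from the previous example: the static isSplitable(Configuration) overload reads a job-level flag first, and only if splitting is allowed does the codec lookup run, so either check alone can veto splitting.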