List of usage examples for org.apache.hadoop.fs.Path.getFileSystem
public FileSystem getFileSystem(Configuration conf) throws IOException
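Before the examples from real projects below, here is a minimal self-contained sketch of the typical call pattern. The class name, path strings, and check-then-write flow are illustrative assumptions, not taken from any of the source files that follow:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class GetFileSystemExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Illustrative path; it resolves against fs.defaultFS (e.g. hdfs:// or file://)
        Path path = new Path("/tmp/example/output");

        // getFileSystem returns the FileSystem instance that owns this path's scheme
        FileSystem fs = path.getFileSystem(conf);

        // The recurring pattern in the examples below: check/create before writing
        if (!fs.exists(path) && !fs.mkdirs(path)) {
            throw new IOException("Mkdirs failed to create " + path);
        }

        FSDataOutputStream out = fs.create(new Path(path, "part-00000"), true);
        try {
            out.writeUTF("hello");
        } finally {
            out.close();
        }
    }
}

Calling getFileSystem on the Path (rather than FileSystem.get(conf)) dispatches on the path's own scheme and authority, which is why the examples below use it to handle HDFS and local paths uniformly.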
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
@Override
public void setupJob(JobContext context) throws IOException {
    super.setupJob(context);

    // get output file directories and create directories
    JobConf conf = context.getJobConf();
    String[] loutputs = MRJobConfiguration.getOutputs(conf);
    for (String dir : loutputs) {
        Path path = new Path(dir);
        FileSystem fs = path.getFileSystem(conf);
        if (!fs.mkdirs(path))
            LOG.error("Mkdirs failed to create " + path.toString());
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
@Override
public void cleanupJob(JobContext context) throws IOException {
    JobConf conf = context.getJobConf();

    // do the cleanup of the temporary directory
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        FileSystem fs = outputPath.getFileSystem(conf);
        context.getProgressible().progress();
        if (fs.exists(outputPath))
            fs.delete(outputPath, true);
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    JobConf conf = context.getJobConf();
    TaskAttemptID attemptId = context.getTaskAttemptID();

    // get the mapping between index and output filename
    outputs = MRJobConfiguration.getOutputs(conf);

    // get temp task output path (compatible with hadoop1 and hadoop2)
    Path taskOutPath = FileOutputFormat.getWorkOutputPath(conf);
    FileSystem fs = taskOutPath.getFileSystem(conf);
    if (!fs.exists(taskOutPath))
        throw new IOException("Task output path " + taskOutPath.toString() + " does not exist.");

    // move the task outputs to their final places
    context.getProgressible().progress();
    moveFinalTaskOutputs(context, fs, taskOutPath);

    // delete the temporary task-specific output directory
    if (!fs.delete(taskOutPath, true))
        LOG.debug("Failed to delete the temporary output directory of task: " + attemptId + " - "
                + taskOutPath);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.UnPaddedOutputFormat.java
License:Open Source License
@Override
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    Path file = FileOutputFormat.getTaskOutputPath(job, name);
    FileSystem fs = file.getFileSystem(job);
    FSDataOutputStream fileOut = fs.create(file, true, job.getInt("io.file.buffer.size", 4096), progress);
    return new UnpaddedRecordWriter<K, V>(fileOut);
}
From source file:com.ibm.bi.dml.runtime.matrix.sort.CompactOutputFormat.java
License:Open Source License
public RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job, String name, Progressable progress)
        throws IOException {
    Path file = FileOutputFormat.getTaskOutputPath(job, name);
    FileSystem fs = file.getFileSystem(job);
    FSDataOutputStream fileOut = fs.create(file, progress);
    return new FixedLengthRecordWriter<K, V>(fileOut, job);
}
From source file:com.ibm.bi.dml.runtime.matrix.sort.SamplingSortMRInputFormat.java
License:Open Source License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param conf the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 * @throws IllegalAccessException
 * @throws InstantiationException
 */
@SuppressWarnings({ "unchecked", "unused", "deprecation" })
public static int writePartitionFile(JobConf conf, Path partFile)
        throws IOException, InstantiationException, IllegalAccessException {
    SamplingSortMRInputFormat inFormat = new SamplingSortMRInputFormat();
    Sampler sampler = new Sampler();

    Class<? extends WritableComparable> targetKeyClass;
    targetKeyClass = (Class<? extends WritableComparable>) conf.getClass(TARGET_KEY_CLASS,
            WritableComparable.class);

    // get input converter information
    int brlen = MRJobConfiguration.getNumRowsPerBlock(conf, (byte) 0);
    int bclen = MRJobConfiguration.getNumColumnsPerBlock(conf, (byte) 0);

    // indicate whether the matrix value in this mapper is a matrix cell or a matrix block
    int partitions = conf.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 1000);
    InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
    int samples = Math.min(10, splits.length);
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.length / samples;

    // take N samples from different parts of the input
    int totalcount = 0;
    for (int i = 0; i < samples; ++i) {
        SequenceFileRecordReader reader = (SequenceFileRecordReader) inFormat
                .getRecordReader(splits[sampleStep * i], conf, null);
        int count = 0;
        WritableComparable key = (WritableComparable) reader.createKey();
        Writable value = (Writable) reader.createValue();
        while (reader.next(key, value) && count < recordsPerSample) {
            Converter inputConverter = MRJobConfiguration.getInputConverter(conf, (byte) 0);
            inputConverter.setBlockSize(brlen, bclen);
            inputConverter.convert(key, value);
            while (inputConverter.hasNext()) {
                Pair pair = inputConverter.next();
                if (pair.getKey() instanceof DoubleWritable) {
                    sampler.addValue(new DoubleWritable(((DoubleWritable) pair.getKey()).get()));
                } else if (pair.getValue() instanceof MatrixCell) {
                    sampler.addValue(new DoubleWritable(((MatrixCell) pair.getValue()).getValue()));
                } else
                    throw new IOException("SamplingSortMRInputFormat unsupported key/value class: "
                            + pair.getKey().getClass() + ":" + pair.getValue().getClass());
                count++;
            }
            key = (WritableComparable) reader.createKey();
            value = (Writable) reader.createValue();
        }
        totalcount += count;
    }

    if (totalcount == 0) // empty input files
        sampler.addValue(new DoubleWritable(0));

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile)) {
        outFs.delete(partFile, false);
    }

    // note: key value always double/null as expected by partitioner
    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, DoubleWritable.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    int index0 = -1, i = 0;
    boolean lessthan0 = true;
    for (WritableComparable splitValue : sampler.createPartitions(partitions)) {
        writer.append(splitValue, nullValue);
        if (lessthan0 && ((DoubleWritable) splitValue).get() >= 0) {
            index0 = i;
            lessthan0 = false;
        }
        i++;
    }
    if (lessthan0)
        index0 = partitions - 1;
    writer.close();

    return index0;
}
From source file:com.ibm.bi.dml.runtime.matrix.SortMR.java
License:Open Source License
@SuppressWarnings({ "unchecked", "rawtypes" }) public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception { boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes; String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output; JobConf job = new JobConf(SortMR.class); job.setJobName("SortMR"); //setup partition file String pfname = MRJobConfiguration.setUpSortPartitionFilename(job); Path partitionFile = new Path(pfname); URI partitionUri = new URI(partitionFile.toString()); //setup input/output paths Path inputDir = new Path(input); inputDir = inputDir.makeQualified(inputDir.getFileSystem(job)); SamplingSortMRInputFormat.setInputPaths(job, inputDir); Path outpath = new Path(tmpOutput); FileOutputFormat.setOutputPath(job, outpath); MapReduceTool.deleteFileIfExistOnHDFS(outpath, job); //set number of reducers (1 if local mode) if (InfrastructureAnalyzer.isLocalMode(job)) job.setNumReduceTasks(1);//from w w w. j av a 2 s. co m else MRJobConfiguration.setNumReducers(job, numReducers, numReducers); //setup input/output format job.setInputFormat(SamplingSortMRInputFormat.class); SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass); //setup instructions and meta information if (combineInst != null && !combineInst.trim().isEmpty()) job.set(COMBINE_INSTRUCTION, combineInst); job.set(SORT_INSTRUCTION, sortInst); job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight); boolean desc = getSortInstructionDescending(sortInst); job.setBoolean(SORT_DECREASING, desc); MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen); MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile); //setup mapper/reducer/partitioner/output classes if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) { MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass); job.setMapperClass(IndexSortMapper.class); job.setReducerClass(IndexSortReducer.class); job.setMapOutputKeyClass(!desc ? 
IndexSortComparable.class : IndexSortComparableDesc.class); job.setMapOutputValueClass(LongWritable.class); job.setOutputKeyClass(MatrixIndexes.class); job.setOutputValueClass(MatrixBlock.class); } else { //default case: SORT w/wo weights MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL); job.setOutputFormat(CompactOutputFormat.class); job.setMapperClass(ValueSortMapper.class); job.setReducerClass(ValueSortReducer.class); job.setOutputKeyClass(outputInfo.outputKeyClass); //double job.setOutputValueClass(outputInfo.outputValueClass); //int } job.setPartitionerClass(TotalOrderPartitioner.class); //setup distributed cache DistributedCache.addCacheFile(partitionUri, job); DistributedCache.createSymlink(job); //setup replication factor job.setInt("dfs.replication", replication); MatrixCharacteristics[] s = new MatrixCharacteristics[1]; s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen); // Print the complete instruction if (LOG.isTraceEnabled()) inst.printCompleteMRJobInstruction(s); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); //run mr job RunningJob runjob = JobClient.runJob(job); Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX); numReducers = job.getNumReduceTasks(); //process final meta data long[] counts = new long[numReducers]; long total = 0; for (int i = 0; i < numReducers; i++) { counts[i] = group.getCounter(Integer.toString(i)); total += counts[i]; } //add missing 0s back to the results long missing0s = 0; if (total < rlen * clen) { if (partitionWith0 < 0) throw new RuntimeException("no partition contains 0, which is wrong!"); missing0s = rlen * clen - total; counts[partitionWith0] += missing0s; } else partitionWith0 = -1; if (sortIndexes) { //run builtin job for shifting partially sorted blocks according to global offsets //we do this in this custom form since it would not fit into the current structure //of systemml to output two intermediates (partially sorted data, offsets) out of a //single SortKeys lop boolean success = runjob.isSuccessful(); if (success) { success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output); } MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput); MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success); } else { MapReduceTool.deleteFileIfExistOnHDFS(pfname); return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful()); } }
From source file:com.ibm.jaql.io.hadoop.CompositeOutputAdapter.java
License:Apache License
@Override
public void checkOutputSpecs(FileSystem ignored, JobConf conf) throws IOException {
    for (int i = 0; i < outputs.length; i++) {
        outputs[i].checkOutputSpecs(ignored, subconfs[i]);
        // HACK: Hadoop 0.18 has hacks that specialize FileOutputFormat handling. In particular,
        // the temporary directory is created by the Task or LocalJobRunner; they also promote
        // the temporary files to the parent upon completion. We create the temporary directory
        // here, if it doesn't already exist.
        Path outputPath = FileOutputFormat.getOutputPath(subconfs[i]);
        if (outputPath != null) {
            final String TEMP_DIR_NAME = "_temporary"; // MRConstants isn't public...
            Path jobTmpDir = new Path(outputPath, TEMP_DIR_NAME); // MRConstants.TEMP_DIR_NAME
            FileSystem fs = jobTmpDir.getFileSystem(subconfs[i]);
            if (!fs.exists(jobTmpDir)) {
                fs.mkdirs(jobTmpDir);
            }
        }
    }
}
From source file:com.ibm.jaql.io.hadoop.DirectFileOutputCommiter.java
License:Apache License
@Override
public void setupJob(JobContext context) throws IOException {
    // Create the path to the file, if needed.
    JobConf conf = context.getJobConf();
    Path outputPath = FileOutputFormat.getOutputPath(conf);
    if (outputPath != null) {
        Path tmpDir = outputPath.getParent();
        FileSystem fileSys = outputPath.getFileSystem(conf);
        if (!fileSys.mkdirs(tmpDir)) {
            throw new IOException("Mkdirs failed to create " + tmpDir.toString());
        }
    }
}
From source file:com.ibm.jaql.io.hadoop.FileOutputConfigurator.java
License:Apache License
public void setSequential(JobConf conf) throws Exception {
    registerSerializers(conf);

    // For an expression, the location is the final file name
    Path outPath = new Path(location);
    FileSystem fs = outPath.getFileSystem(conf);
    outPath = outPath.makeQualified(fs);
    if (fs.exists(outPath)) {
        // TODO: Jaql currently has overwrite semantics; add flag to control this
        if (fs.isFile(outPath)) {
            fs.delete(outPath, false);
        } else {
            // Look for a map-reduce output directory
            FileStatus[] nonMR = fs.listStatus(outPath, new PathFilter() {
                boolean onlyOne = true;

                public boolean accept(Path path) {
                    String name = path.getName();
                    if (name.matches("([.][.]?)|([.]part-[0-9]+.crc)|(part-[0-9]+)")) {
                        return false;
                    }
                    if (onlyOne) {
                        onlyOne = false;
                        return true;
                    }
                    return false;
                }
            });
            if (nonMR.length > 0) {
                throw new IOException(
                        "directory exists and is not a map-reduce output directory: " + nonMR[0].getPath());
            }
            fs.delete(outPath, true);
        }
    }

    // In sequential mode, we will write directly to the output file
    // and bypass the _temporary directory and rename of the standard
    // FileOutputCommitter by using our own DirectFileOutputCommitter.
    FileOutputFormat.setOutputPath(conf, outPath.getParent());
    conf.setClass("mapred.output.committer.class", DirectFileOutputCommiter.class, OutputCommitter.class);
}