List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
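Before the per-project examples below, a minimal standalone sketch of what Path.toString() returns; all paths and host names here are illustrative:

import org.apache.hadoop.fs.Path;

public class PathToStringDemo {
    public static void main(String[] args) {
        // toString() returns the path as a URI-style string,
        // preserving any scheme and authority that were supplied
        Path relative = new Path("temp/out/file.csv");
        Path qualified = new Path("hdfs://namenode:8020/user/biadmin/csv/file.csv");

        System.out.println(relative.toString());  // temp/out/file.csv
        System.out.println(qualified.toString()); // hdfs://namenode:8020/user/biadmin/csv/file.csv

        // new Path(parent, child) joins components; toString() shows the combined path
        Path child = new Path(new Path("/user/biadmin/csv"), "part-00000");
        System.out.println(child.toString());     // /user/biadmin/csv/part-00000
    }
}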
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Adds a generated header row to a CSV file (or directory of part files) on HDFS,
 * writing the result to destFileName.
 *
 * @param srcFileName source file or directory on HDFS
 * @param destFileName destination file on HDFS
 * @param rlen number of rows
 * @param clen number of columns
 * @throws IOException
 */
@SuppressWarnings("unchecked")
public void addHeaderToCSV(String srcFileName, String destFileName, long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path destFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (!_props.hasHeader()) {
        // simply move srcFile to destFile
        /*
         * TODO: Remove this roundabout way!
         * For example: destFilePath = /user/biadmin/csv/temp/out/file.csv
         * & the only path that exists already on HDFS is /user/biadmin/csv/.
         * In this case: the directory structure /user/biadmin/csv/temp/out must be created.
         * Simple hdfs.rename() does not seem to create this directory structure.
         */

        // delete the destination file, if exists already
        //boolean ret1 = hdfs.delete(destFilePath, true);

        // Create /user/biadmin/csv/temp/out/file.csv so that ..../temp/out/ is created.
        //boolean ret2 = hdfs.createNewFile(destFilePath);

        // delete the file "file.csv" but preserve the directory structure /user/biadmin/csv/temp/out/
        //boolean ret3 = hdfs.delete(destFilePath, true);

        // finally, move the data to destFilePath = /user/biadmin/csv/temp/out/file.csv
        //boolean ret4 = hdfs.rename(srcFilePath, destFilePath);

        //System.out.println("Return values = del:" + ret1 + ", createNew:" + ret2 + ", del:" + ret3 + ", rename:" + ret4);

        return;
    }

    // construct the header line
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < clen; i++) {
        sb.append("C" + (i + 1));
        if (i < clen - 1)
            sb.append(_props.getDelim());
    }
    sb.append('\n');

    if (hdfs.isDirectory(srcFilePath)) {
        // compute sorted order among part files
        ArrayList<Path> files = new ArrayList<Path>();
        for (FileStatus stat : hdfs.listStatus(srcFilePath, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);

        // first part file path
        Path firstpart = files.get(0);

        // create a temp file, and add header and contents of first part
        Path tmp = new Path(firstpart.toString() + ".tmp");
        OutputStream out = hdfs.create(tmp, true);
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy rest of the data from firstpart
        InputStream in = null;
        try {
            in = hdfs.open(firstpart);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }

        // rename tmp to firstpart
        hdfs.delete(firstpart, true);
        hdfs.rename(tmp, firstpart);

        // rename srcfile to destFile
        hdfs.delete(destFilePath, true);
        hdfs.createNewFile(destFilePath);       // force the creation of directory structure
        hdfs.delete(destFilePath, true);        // delete the file, but preserve the directory structure
        hdfs.rename(srcFilePath, destFilePath); // move the data
    } else if (hdfs.isFile(srcFilePath)) {
        // create destination file
        OutputStream out = hdfs.create(destFilePath, true);

        // write header
        out.write(sb.toString().getBytes());
        sb.setLength(0);

        // copy the data from srcFile
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}
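The example above derives a sibling temp file by concatenating a suffix onto Path.toString(). A minimal sketch of that pattern in isolation; the file name is illustrative:

import org.apache.hadoop.fs.Path;

public class TempPathSketch {
    public static void main(String[] args) {
        Path firstpart = new Path("/user/biadmin/csv/temp/out/0-m-00000");

        // toString() yields the full path string, so appending a suffix
        // produces a sibling file in the same directory
        Path tmp = new Path(firstpart.toString() + ".tmp");
        System.out.println(tmp); // /user/biadmin/csv/temp/out/0-m-00000.tmp
    }
}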
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSVParallel.java
License:Open Source License
/**
 * Writes a matrix block to HDFS in CSV format, using multiple writer threads when beneficial.
 *
 * @param path output path on HDFS
 * @param job job configuration
 * @param src source matrix block
 * @param rlen number of rows
 * @param clen number of columns
 * @param nnz number of non-zero values
 * @param props CSV format properties
 * @throws IOException
 */
@Override
protected void writeCSVMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen, long nnz,
        CSVFileFormatProperties props) throws IOException {
    // estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.CSVOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    // determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    // fall back to sequential write if dop is 1 (e.g., <128MB) in order to create a single file
    if (numThreads <= 1) {
        super.writeCSVMatrixToHDFS(path, job, src, rlen, clen, nnz, props);
        return;
    }

    // create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    // create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteCSVTask> tasks = new ArrayList<WriteCSVTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads && i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteCSVTask(newPath, job, src, i * blklen,
                    (int) Math.min((i + 1) * blklen, rlen), props));
        }

        // wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        // check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of csv output.", e);
    }
}
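The parallel writer names its part files via new Path(parent, child), mirroring the "0-m-NNNNN" convention of MapReduce task outputs. A minimal sketch of the resulting layout; the directory name is illustrative:

import org.apache.hadoop.fs.Path;

public class PartFileNamingSketch {
    public static void main(String[] args) {
        // the output "file" is actually a directory holding one part file per task
        Path dir = new Path("/user/biadmin/out.csv");
        for (int i = 0; i < 3; i++) {
            Path part = new Path(dir, String.format("0-m-%05d", i));
            System.out.println(part.toString());
        }
        // prints:
        // /user/biadmin/out.csv/0-m-00000
        // /user/biadmin/out.csv/0-m-00001
        // /user/biadmin/out.csv/0-m-00002
    }
}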
From source file:com.ibm.bi.dml.runtime.matrix.CleanupMR.java
License:Open Source License
/**
 * Writes one line per cleanup task to the given task file on HDFS.
 *
 * @param path task file path on HDFS
 * @param numTasks number of cleanup tasks
 * @throws DMLRuntimeException
 * @throws IOException
 */
private static void writeCleanupTasksToFile(Path path, int numTasks) throws DMLRuntimeException, IOException {
    BufferedWriter br = null;
    try {
        FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
        br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));

        for (int i = 1; i <= numTasks; i++)
            br.write("CLEANUP TASK " + i + "\n");
    } catch (Exception ex) {
        throw new DMLRuntimeException("Error writing cleanup tasks to taskfile " + path.toString(), ex);
    } finally {
        if (br != null)
            br.close();
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.CSVReblockMR.java
License:Open Source License
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;

    // set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    // set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    // set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    // set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    // set up the instructions that will happen in the reducer, after the aggregation instructions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    // set up the replication factor for the results
    job.setInt("dfs.replication", replication);

    // set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    // set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
    MatrixCharacteristics[] stats = ret.stats;

    // set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens,
            ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // print the complete instruction
    //if (LOG.isTraceEnabled())
    //    inst.printCompelteMRJobInstruction(stats);

    // update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    // set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    // configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    // turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    // set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    // process different counters
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //System.out.println("result #" + resultIndexes[i] + " ===>\n" + stats[i]);
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
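The job above round-trips a Path through the configuration as a plain string: the driver stores cachefile.toString() under ROWID_FILE_NAME, and the mapper later reconstructs it with new Path(job.get(...)), as in the CSVReblockMapper example further below. A minimal sketch of that round trip; the config key and path are illustrative, not the actual ROWID_FILE_NAME constant:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class PathConfigRoundTrip {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        Path cachefile = new Path("/tmp/counters/part-00000");

        // a Path cannot be stored in a JobConf directly, so it is serialized via toString() ...
        job.set("example.rowid.file", cachefile.toString());

        // ... and reconstructed from the string on the other side
        Path restored = new Path(job.get("example.rowid.file"));
        System.out.println(restored.equals(cachefile)); // true
    }
}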
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} to the list of
 * inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass) {
    String inputFormatMapping = path.toString() + ";" + inputFormatClass.getName();
    String inputFormats = conf.get("mapred.input.dir.formats");
    conf.set("mapred.input.dir.formats",
            inputFormats == null ? inputFormatMapping : inputFormats + "," + inputFormatMapping);

    conf.setInputFormat(DelegatingInputFormat.class);
}
From source file:com.ibm.bi.dml.runtime.matrix.data.hadoopfix.MultipleInputs.java
License:Apache License
/**
 * Add a {@link Path} with a custom {@link InputFormat} and
 * {@link Mapper} to the list of inputs for the map-reduce job.
 *
 * @param conf The configuration of the job
 * @param path {@link Path} to be added to the list of inputs for the job
 * @param inputFormatClass {@link InputFormat} class to use for this path
 * @param mapperClass {@link Mapper} class to use for this path
 */
public static void addInputPath(JobConf conf, Path path, Class<? extends InputFormat> inputFormatClass,
        Class<? extends Mapper> mapperClass) {
    addInputPath(conf, path, inputFormatClass);

    String mapperMapping = path.toString() + ";" + mapperClass.getName();
    String mappers = conf.get("mapred.input.dir.mappers");
    conf.set("mapred.input.dir.mappers",
            mappers == null ? mapperMapping : mappers + "," + mapperMapping);

    conf.setMapperClass(DelegatingMapper.class);
}
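A hedged usage sketch of the first overload above, assuming this MultipleInputs class is imported; paths are illustrative and the job is never submitted:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class MultipleInputsDemo {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // register two inputs with different formats
        MultipleInputs.addInputPath(conf, new Path("/data/a.csv"), TextInputFormat.class);
        MultipleInputs.addInputPath(conf, new Path("/data/b.seq"), SequenceFileInputFormat.class);

        // each call appended "path;formatClass" to the comma-separated list:
        System.out.println(conf.get("mapred.input.dir.formats"));
        // /data/a.csv;org.apache.hadoop.mapred.TextInputFormat,/data/b.seq;org.apache.hadoop.mapred.SequenceFileInputFormat
    }
}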
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
@Override
public void setupJob(JobContext context) throws IOException {
    super.setupJob(context);

    // get output file directories and create directories
    JobConf conf = context.getJobConf();
    String[] loutputs = MRJobConfiguration.getOutputs(conf);
    for (String dir : loutputs) {
        Path path = new Path(dir);
        FileSystem fs = path.getFileSystem(conf);
        if (!fs.mkdirs(path))
            LOG.error("Mkdirs failed to create " + path.toString());
    }
}
From source file:com.ibm.bi.dml.runtime.matrix.data.MultipleOutputCommitter.java
License:Open Source License
@Override
public void commitTask(TaskAttemptContext context) throws IOException {
    JobConf conf = context.getJobConf();
    TaskAttemptID attemptId = context.getTaskAttemptID();

    // get the mapping between index to output filename
    outputs = MRJobConfiguration.getOutputs(conf);

    // get temp task output path (compatible with hadoop1 and hadoop2)
    Path taskOutPath = FileOutputFormat.getWorkOutputPath(conf);
    FileSystem fs = taskOutPath.getFileSystem(conf);
    if (!fs.exists(taskOutPath))
        throw new IOException("Task output path " + taskOutPath.toString() + " does not exist.");

    // move the task outputs to their final places
    context.getProgressible().progress();
    moveFinalTaskOutputs(context, fs, taskOutPath);

    // delete the temporary task-specific output directory
    if (!fs.delete(taskOutPath, true))
        LOG.debug("Failed to delete the temporary output directory of task: " + attemptId + " - "
                + taskOutPath);
}
From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVAssignRowIDMapper.java
License:Open Source License
@Override @SuppressWarnings("deprecation") public void configure(JobConf job) { byte thisIndex; try {/*from w w w. ja va 2s. co m*/ //it doesn't make sense to have repeated file names in the input, since this is for reblock thisIndex = MRJobConfiguration.getInputMatrixIndexesInMapper(job).get(0); outKey.set(thisIndex); FileSystem fs = FileSystem.get(job); Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs); filename = thisPath.toString(); String[] strs = job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT); Path headerPath = new Path(strs[thisIndex]).makeQualified(fs); if (headerPath.toString().equals(filename)) headerFile = true; } catch (IOException e) { throw new RuntimeException(e); } try { CSVReblockInstruction[] reblockInstructions = MRJobConfiguration.getCSVReblockInstructions(job); for (CSVReblockInstruction ins : reblockInstructions) { if (ins.input == thisIndex) { delim = Pattern.quote(ins.delim); ignoreFirstLine = ins.hasHeader; break; } } } catch (DMLUnsupportedOperationException e) { throw new RuntimeException(e); } catch (DMLRuntimeException e) { throw new RuntimeException(e); } // load properties relevant to transform try { boolean omit = job.getBoolean(MRJobConfiguration.TF_TRANSFORM, false); if (omit) _agents = new TfUtils(job, true); } catch (IOException e) { throw new RuntimeException(e); } catch (JSONException e) { throw new RuntimeException(e); } }
From source file:com.ibm.bi.dml.runtime.matrix.mapred.CSVReblockMapper.java
License:Open Source License
@Override @SuppressWarnings("deprecation") public void configure(JobConf job) { super.configure(job); //get the number colums per block //load the offset mapping byte matrixIndex = representativeMatrixes.get(0); try {//from w w w. ja va2 s. co m FileSystem fs = FileSystem.get(job); Path thisPath = new Path(job.get("map.input.file")).makeQualified(fs); String filename = thisPath.toString(); Path headerPath = new Path(job.getStrings(CSVReblockMR.SMALLEST_FILE_NAME_PER_INPUT)[matrixIndex]) .makeQualified(fs); if (headerPath.toString().equals(filename)) headerFile = true; ByteWritable key = new ByteWritable(); OffsetCount value = new OffsetCount(); Path p = new Path(job.get(CSVReblockMR.ROWID_FILE_NAME)); SequenceFile.Reader reader = new SequenceFile.Reader(fs, p, job); while (reader.next(key, value)) { if (key.get() == matrixIndex && filename.equals(value.filename)) offsetMap.put(value.fileOffset, value.count); } reader.close(); } catch (IOException e) { throw new RuntimeException(e); } CSVReblockInstruction ins = csv_reblock_instructions.get(0).get(0); _delim = ins.delim; ignoreFirstLine = ins.hasHeader; idxRow = new IndexedBlockRow(); int maxBclen = 0; for (ArrayList<CSVReblockInstruction> insv : csv_reblock_instructions) for (CSVReblockInstruction in : insv) { if (maxBclen < in.bclen) maxBclen = in.bclen; } //always dense since common csv usecase idxRow.getRow().data.reset(1, maxBclen, false); }