List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
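Before the project-specific examples below, here is a minimal, self-contained sketch of what Path.toString() returns. The file name data/matrix.csv and the use of the default (local) file system are illustrative assumptions, not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) throws Exception {
        // a relative path: toString() returns the path string as given
        Path relative = new Path("data/matrix.csv");   // illustrative file name
        System.out.println(relative.toString());       // prints "data/matrix.csv"

        // a qualified path: toString() also includes scheme, authority, and the absolute path
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path qualified = fs.makeQualified(relative);
        System.out.println(qualified.toString());      // e.g. "file:/home/user/data/matrix.csv"
    }
}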
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java
License:Open Source License
/**
 * @param fname null if no comparison required
 * @param fnameNew
 * @param srcFnames
 * @param ii
 * @param oi
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen, bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format
        job.setInputFormat(ii.inputFormatClass);

        //set the input path
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
From source file:com.ibm.bi.dml.runtime.io.MatrixReader.java
License:Open Source License
/**
 * @param fs
 * @param path
 * @throws IOException
 */
protected static void checkValidInputFile(FileSystem fs, Path path) throws IOException {
    //check non-existing file
    if (!fs.exists(path))
        throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");

    //check for empty file
    if (MapReduceTool.isFileEmpty(fs, path.toString()))
        throw new EOFException("Empty input file " + path.toString() + ".");
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param fs
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param hasHeader
 * @param delim
 * @param fill
 * @param fillValue
 * @return
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java
License:Open Source License
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    FileInputFormat.addInputPath(job, path);

    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    InputSplit[] splits = informat.getSplits(job, _numThreads);
    if (splits[0] instanceof FileSplit) {
        // The splits do not always arrive in order by file name.
        // Sort the splits lexicographically by path so that the header will
        // be in the first split.
        // Note that we're assuming that the splits come in order by offset
        Arrays.sort(splits, new Comparator<InputSplit>() {
            @Override
            public int compare(InputSplit o1, InputSplit o2) {
                Path p1 = ((FileSplit) o1).getPath();
                Path p2 = ((FileSplit) o2).getPath();
                return p1.toString().compareTo(p2.toString());
            }
        });
    }

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(),
            _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();

    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(),
            _props.getDelim(), _props.isFill(), _props.getFillValue());

    //post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();

    // sanity check for parallel row count (since determined internally)
    if (rlen > 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + rlen
                + ", real nrow=" + ret.getNumRows());

    return ret;
}
From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlock.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param pformat
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
@SuppressWarnings("deprecation")
public void writePartitionedBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, PDataPartitionFormat pformat)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    boolean sparse = src.isInSparseFormat();
    FileSystem fs = FileSystem.get(job);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //initialize blocks for reuse (at most 4 different blocks required)
    MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse, src.getNonZeros());

    switch (pformat) {
    case ROW_BLOCK_WISE_N: {
        long numBlocks = ((rlen - 1) / brlen) + 1;
        long numPartBlocks = (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / clen / brlen);

        int count = 0;
        for (int k = 0; k < numBlocks; k += numPartBlocks) {
            // 1) create sequence file writer, with right replication factor
            // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            Path path2 = new Path(path.toString() + File.separator + (++count));
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path2, MatrixIndexes.class,
                    MatrixBlock.class);

            //3) reblock and write
            try {
                MatrixIndexes indexes = new MatrixIndexes();

                //create and write subblocks of matrix
                for (int blockRow = k; blockRow < Math.min((int) Math.ceil(src.getNumRows() / (double) brlen),
                        k + numPartBlocks); blockRow++)
                    for (int blockCol = 0; blockCol < (int) Math
                            .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                        int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                                : src.getNumRows() - blockRow * brlen;
                        int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                                : src.getNumColumns() - blockCol * bclen;

                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            } finally {
                IOUtilFunctions.closeSilently(writer);
            }
        }
        break;
    }
    case COLUMN_BLOCK_WISE_N: {
        long numBlocks = ((clen - 1) / bclen) + 1;
        long numPartBlocks = (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / rlen / bclen);

        int count = 0;
        for (int k = 0; k < numBlocks; k += numPartBlocks) {
            // 1) create sequence file writer, with right replication factor
            // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            Path path2 = new Path(path.toString() + File.separator + (++count));
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path2, MatrixIndexes.class,
                    MatrixBlock.class);

            //3) reblock and write
            try {
                MatrixIndexes indexes = new MatrixIndexes();

                //create and write subblocks of matrix
                for (int blockRow = 0; blockRow < (int) Math
                        .ceil(src.getNumRows() / (double) brlen); blockRow++)
                    for (int blockCol = k; blockCol < Math.min(
                            (int) Math.ceil(src.getNumColumns() / (double) bclen),
                            k + numPartBlocks); blockCol++) {
                        int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                                : src.getNumRows() - blockRow * brlen;
                        int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                                : src.getNumColumns() - blockCol * bclen;

                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            } finally {
                IOUtilFunctions.closeSilently(writer);
            }
        }
        break;
    }
    default:
        throw new DMLRuntimeException("Unsupported partition format for distributed cache input: " + pformat);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlockParallel.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws DMLUnsupportedOperationException
 * @throws DMLRuntimeException
 */
@Override
protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen,
            src.getNonZeros()) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeBinaryBlockMatrixToHDFS(path, job, src, rlen, clen, brlen, bclen, replication);
        return;
    }

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = FileSystem.get(job);

    //create and execute write tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<WriteFileTask>();
        int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, Math.min((i + 1) * blklen, rlen),
                    brlen, bclen, _replication));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterMatrixMarket.java
License:Open Source License
/**
 * @param srcFileName
 * @param fileName
 * @param rlen
 * @param clen
 * @param nnz
 * @throws IOException
 */
public void mergeTextcellToMatrixMarket(String srcFileName, String fileName, long rlen, long clen, long nnz)
        throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path src = new Path(srcFileName);
    Path merge = new Path(fileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(merge)) {
        hdfs.delete(merge, true);
    }

    OutputStream out = hdfs.create(merge, true);

    // write out the header first
    StringBuilder sb = new StringBuilder();
    sb.append("%%MatrixMarket matrix coordinate real general\n");

    // output number of rows, number of columns and number of nnz
    sb.append(rlen + " " + clen + " " + nnz + "\n");
    out.write(sb.toString().getBytes());

    // if the source is a directory
    if (hdfs.getFileStatus(src).isDirectory()) {
        try {
            FileStatus[] contents = hdfs.listStatus(src);
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    InputStream in = hdfs.open(contents[i].getPath());
                    try {
                        IOUtils.copyBytes(in, out, conf, false);
                    } finally {
                        IOUtilFunctions.closeSilently(in);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(src)) {
        InputStream in = null;
        try {
            in = hdfs.open(src);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(src.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterMatrixMarketParallel.java
License:Open Source License
/**
 * @param fileName
 * @param src
 * @param rlen
 * @param clen
 * @param nnz
 * @throws IOException
 */
@Override
protected void writeMatrixMarketMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        long nnz) throws IOException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.MatrixMarketOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeMatrixMarketMatrixToHDFS(path, job, src, rlen, clen, nnz);
        return;
    }

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteMMTask> tasks = new ArrayList<WriteMMTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteMMTask(newPath, job, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCellParallel.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 */
@Override
protected void writeTextCellMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen)
        throws IOException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.TextCellOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeTextCellMatrixToHDFS(path, job, src, rlen, clen);
        return;
    }

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteTextTask> tasks = new ArrayList<WriteTextTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteTextTask(newPath, job, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS.
 * The part files are created by the CSV_WRITE MR job.
 *
 * This method is invoked from the CP-write instruction.
 *
 * @param srcFileName
 * @param destFileName
 * @param csvprop
 * @param rlen
 * @param clen
 * @throws IOException
 */
public void mergeCSVPartFiles(String srcFileName, String destFileName, CSVFileFormatProperties csvprop,
        long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
        hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++) {
            sb.append("C" + (i + 1));
            if (i < clen - 1)
                sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
        sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
        try {
            FileStatus[] contents = hdfs.listStatus(srcFilePath);
            Path[] partPaths = new Path[contents.length];
            int numPartFiles = 0;
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    partPaths[i] = contents[i].getPath();
                    numPartFiles++;
                }
            }
            Arrays.sort(partPaths);

            for (int i = 0; i < numPartFiles; i++) {
                InputStream in = hdfs.open(partPaths[i]);
                try {
                    IOUtils.copyBytes(in, out, conf, false);
                    if (i < numPartFiles - 1)
                        out.write('\n');
                } finally {
                    IOUtilFunctions.closeSilently(in);
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}