List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
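Before the project-specific examples below, here is a minimal, self-contained sketch of what Path.toString() returns. The file name data/matrix.csv and the use of the default (local) file system are illustrative assumptions, not taken from the examples on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) throws Exception {
        // a relative path: toString() returns the path string as given
        Path relative = new Path("data/matrix.csv");   // illustrative file name
        System.out.println(relative.toString());       // prints "data/matrix.csv"

        // a qualified path: toString() also includes scheme, authority, and the absolute path
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path qualified = fs.makeQualified(relative);
        System.out.println(qualified.toString());      // e.g. "file:/home/user/data/matrix.csv"
    }
}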
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.ResultMergeRemoteMR.java
License:Open Source License
/**
 * @param fname null if no comparison required
 * @param fnameNew
 * @param srcFnames
 * @param ii
 * @param oi
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws DMLRuntimeException
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi,
        long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;

    JobConf job;
    job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);

    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();

    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare
            && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi)
                + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");

    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);

        /////
        //configure the MR job
        if (withCompare) {
            pathCompare = new Path(fname).makeQualified(FileSystem.get(job));
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        } else
            MRJobConfiguration.setResultMergeInfo(job, "null", ii,
                    LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, bclen, bclen);

        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);

        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            job.setPartitionerClass(ResultMergeRemotePartitioning.class); //partitioning
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class); //grouping
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class); //sorting

            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }

        //set input format
        job.setInputFormat(ii.inputFormatClass);

        //set the input path
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++)
                paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);

        //set output format
        job.setOutputFormat(oi.outputFormatClass);

        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);

        //////
        //set optimization parameters

        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));

        //use FLEX scheduler configuration properties
        if (ParForProgramBlock.USE_FLEX_SCHEDULER_CONF) {
            job.setInt("flex.map.min", 0);
            job.setInt("flex.map.max", _numMappers);
            job.setInt("flex.reduce.min", 0);
            job.setInt("flex.reduce.max", _numMappers);
        }

        //disable automatic tasks timeouts and speculative task exec
        job.setInt("mapred.task.timeout", 0);
        job.setMapSpeculativeExecution(false);

        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);

        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            job.setNumTasksToExecutePerJvm(-1); //unlimited

        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set("mapred.compress.map.output", "true");
        //job.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");

        //set the replication factor for the results
        job.setInt("dfs.replication", _replication);

        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt("mapreduce.map.maxattempts", _max_retry);

        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);

        /////
        // execute the MR job
        JobClient.runJob(job);

        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
From source file:com.ibm.bi.dml.runtime.io.MatrixReader.java
License:Open Source License
/**
 * @param fs
 * @param path
 * @throws IOException
 */
protected static void checkValidInputFile(FileSystem fs, Path path) throws IOException {
    //check non-existing file
    if (!fs.exists(path))
        throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");

    //check for empty file
    if (MapReduceTool.isFileEmpty(fs, path.toString()))
        throw new EOFException("Empty input file " + path.toString() + ".");
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSV.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param fs
 * @param dest
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param hasHeader
 * @param delim
 * @param fill
 * @param fillValue
 * @return
 * @throws IOException
 */
@SuppressWarnings("unchecked")
private MatrixBlock readCSVMatrixFromHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock dest, long rlen,
        long clen, int brlen, int bclen, boolean hasHeader, String delim, boolean fill, double fillValue)
        throws IOException {
    ArrayList<Path> files = new ArrayList<Path>();
    if (fs.isDirectory(path)) {
        for (FileStatus stat : fs.listStatus(path, CSVReblockMR.hiddenFileFilter))
            files.add(stat.getPath());
        Collections.sort(files);
    } else
        files.add(path);

    if (dest == null) {
        dest = computeCSVSize(files, job, fs, hasHeader, delim, fill, fillValue);
        clen = dest.getNumColumns();
    }

    boolean sparse = dest.isInSparseFormat();

    /////////////////////////////////////////
    String value = null;
    int row = 0;
    int col = -1;
    double cellValue = 0;
    long lnnz = 0;

    for (int fileNo = 0; fileNo < files.size(); fileNo++) {
        BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(files.get(fileNo))));
        if (fileNo == 0 && hasHeader)
            br.readLine(); //ignore header

        // Read the data
        boolean emptyValuesFound = false;
        try {
            if (sparse) //SPARSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.appendValue(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            } else //DENSE<-value
            {
                while ((value = br.readLine()) != null) //foreach line
                {
                    String cellStr = value.toString().trim();
                    emptyValuesFound = false;
                    String[] parts = IOUtilFunctions.split(cellStr, delim);
                    col = 0;

                    for (String part : parts) //foreach cell
                    {
                        part = part.trim();
                        if (part.isEmpty()) {
                            emptyValuesFound = true;
                            cellValue = fillValue;
                        } else {
                            cellValue = UtilFunctions.parseToDouble(part);
                        }
                        if (cellValue != 0) {
                            dest.setValueDenseUnsafe(row, col, cellValue);
                            lnnz++;
                        }
                        col++;
                    }

                    //sanity checks for empty values and number of columns
                    IOUtilFunctions.checkAndRaiseErrorCSVEmptyField(cellStr, fill, emptyValuesFound);
                    IOUtilFunctions.checkAndRaiseErrorCSVNumColumns(path.toString(), cellStr, parts, clen);
                    row++;
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(br);
        }
    }

    //post processing
    dest.setNonZeros(lnnz);

    return dest;
}
From source file:com.ibm.bi.dml.runtime.io.ReaderTextCSVParallel.java
License:Open Source License
@Override
public MatrixBlock readMatrixFromHDFS(String fname, long rlen, long clen, int brlen, int bclen, long estnnz)
        throws IOException, DMLRuntimeException {
    // prepare file access
    JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
    FileSystem fs = FileSystem.get(job);
    Path path = new Path(fname);
    FileInputFormat.addInputPath(job, path);

    TextInputFormat informat = new TextInputFormat();
    informat.configure(job);

    InputSplit[] splits = informat.getSplits(job, _numThreads);
    if (splits[0] instanceof FileSplit) {
        // The splits do not always arrive in order by file name.
        // Sort the splits lexicographically by path so that the header will
        // be in the first split.
        // Note that we're assuming that the splits come in order by offset
        Arrays.sort(splits, new Comparator<InputSplit>() {
            @Override
            public int compare(InputSplit o1, InputSplit o2) {
                Path p1 = ((FileSplit) o1).getPath();
                Path p2 = ((FileSplit) o2).getPath();
                return p1.toString().compareTo(p2.toString());
            }
        });
    }

    // check existence and non-empty file
    checkValidInputFile(fs, path);

    // allocate output matrix block
    // First Read Pass (count rows/cols, determine offsets, allocate matrix block)
    MatrixBlock ret = computeCSVSizeAndCreateOutputMatrixBlock(splits, path, job, _props.hasHeader(),
            _props.getDelim(), estnnz);
    rlen = ret.getNumRows();
    clen = ret.getNumColumns();

    // Second Read Pass (read, parse strings, append to matrix block)
    readCSVMatrixFromHDFS(splits, path, job, ret, rlen, clen, brlen, bclen, _props.hasHeader(),
            _props.getDelim(), _props.isFill(), _props.getFillValue());

    //post-processing (representation-specific, change of sparse/dense block representation)
    // - no sorting required for CSV because it is read in sorted order per row
    // - nnz explicitly maintained in parallel for the individual splits
    ret.examSparsity();

    // sanity check for parallel row count (since determined internally)
    if (rlen > 0 && rlen != ret.getNumRows())
        throw new DMLRuntimeException("Read matrix inconsistent with given meta data: " + "expected nrow=" + rlen
                + ", real nrow=" + ret.getNumRows());

    return ret;
}
From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlock.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param pformat
 * @throws IOException
 * @throws DMLRuntimeException
 * @throws DMLUnsupportedOperationException
 */
@SuppressWarnings("deprecation")
public void writePartitionedBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, PDataPartitionFormat pformat)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    boolean sparse = src.isInSparseFormat();
    FileSystem fs = FileSystem.get(job);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //initialize blocks for reuse (at most 4 different blocks required)
    MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse, src.getNonZeros());

    switch (pformat) {
    case ROW_BLOCK_WISE_N: {
        long numBlocks = ((rlen - 1) / brlen) + 1;
        long numPartBlocks = (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / clen / brlen);

        int count = 0;
        for (int k = 0; k < numBlocks; k += numPartBlocks) {
            // 1) create sequence file writer, with right replication factor
            // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            Path path2 = new Path(path.toString() + File.separator + (++count));
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path2, MatrixIndexes.class,
                    MatrixBlock.class);

            //3) reblock and write
            try {
                MatrixIndexes indexes = new MatrixIndexes();

                //create and write subblocks of matrix
                for (int blockRow = k; blockRow < Math.min((int) Math.ceil(src.getNumRows() / (double) brlen),
                        k + numPartBlocks); blockRow++)
                    for (int blockCol = 0; blockCol < (int) Math
                            .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                        int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                                : src.getNumRows() - blockRow * brlen;
                        int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                                : src.getNumColumns() - blockCol * bclen;

                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            } finally {
                IOUtilFunctions.closeSilently(writer);
            }
        }
        break;
    }
    case COLUMN_BLOCK_WISE_N: {
        long numBlocks = ((clen - 1) / bclen) + 1;
        long numPartBlocks = (long) Math.ceil(((double) DistributedCacheInput.PARTITION_SIZE) / rlen / bclen);

        int count = 0;
        for (int k = 0; k < numBlocks; k += numPartBlocks) {
            // 1) create sequence file writer, with right replication factor
            // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
            Path path2 = new Path(path.toString() + File.separator + (++count));
            SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, path2, MatrixIndexes.class,
                    MatrixBlock.class);

            //3) reblock and write
            try {
                MatrixIndexes indexes = new MatrixIndexes();

                //create and write subblocks of matrix
                for (int blockRow = 0; blockRow < (int) Math
                        .ceil(src.getNumRows() / (double) brlen); blockRow++)
                    for (int blockCol = k; blockCol < Math.min(
                            (int) Math.ceil(src.getNumColumns() / (double) bclen),
                            k + numPartBlocks); blockCol++) {
                        int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                                : src.getNumRows() - blockRow * brlen;
                        int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                                : src.getNumColumns() - blockCol * bclen;

                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);

                        //append block to sequence file
                        indexes.setIndexes(blockRow + 1, blockCol + 1);
                        writer.append(indexes, block);

                        //reset block for later reuse
                        block.reset();
                    }
            } finally {
                IOUtilFunctions.closeSilently(writer);
            }
        }
        break;
    }
    default:
        throw new DMLRuntimeException("Unsupported partition format for distributed cache input: " + pformat);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlockParallel.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 * @throws DMLUnsupportedOperationException
 * @throws DMLRuntimeException
 */
@Override
protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimatePartitionedSizeExactSparsity(rlen, clen, brlen, bclen,
            src.getNonZeros()) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelBinaryWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeBinaryBlockMatrixToHDFS(path, job, src, rlen, clen, brlen, bclen, replication);
        return;
    }

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);
    FileSystem fs = FileSystem.get(job);

    //create and execute write tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteFileTask> tasks = new ArrayList<WriteFileTask>();
        int blklen = (int) Math.ceil((double) rlen / brlen / numThreads) * brlen;
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteFileTask(newPath, job, fs, src, i * blklen, Math.min((i + 1) * blklen, rlen),
                    brlen, bclen, _replication));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of binary block input.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterMatrixMarket.java
License:Open Source License
/**
 * @param srcFileName
 * @param fileName
 * @param rlen
 * @param clen
 * @param nnz
 * @throws IOException
 */
public void mergeTextcellToMatrixMarket(String srcFileName, String fileName, long rlen, long clen, long nnz)
        throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path src = new Path(srcFileName);
    Path merge = new Path(fileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(merge)) {
        hdfs.delete(merge, true);
    }

    OutputStream out = hdfs.create(merge, true);

    // write out the header first
    StringBuilder sb = new StringBuilder();
    sb.append("%%MatrixMarket matrix coordinate real general\n");

    // output number of rows, number of columns and number of nnz
    sb.append(rlen + " " + clen + " " + nnz + "\n");
    out.write(sb.toString().getBytes());

    // if the source is a directory
    if (hdfs.getFileStatus(src).isDirectory()) {
        try {
            FileStatus[] contents = hdfs.listStatus(src);
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    InputStream in = hdfs.open(contents[i].getPath());
                    try {
                        IOUtils.copyBytes(in, out, conf, false);
                    } finally {
                        IOUtilFunctions.closeSilently(in);
                    }
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(src)) {
        InputStream in = null;
        try {
            in = hdfs.open(src);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(src.toString() + ": No such file or directory");
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterMatrixMarketParallel.java
License:Open Source License
/**
 * @param fileName
 * @param src
 * @param rlen
 * @param clen
 * @param nnz
 * @throws IOException
 */
@Override
protected void writeMatrixMarketMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        long nnz) throws IOException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.MatrixMarketOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeMatrixMarketMatrixToHDFS(path, job, src, rlen, clen, nnz);
        return;
    }

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteMMTask> tasks = new ArrayList<WriteMMTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteMMTask(newPath, job, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCellParallel.java
License:Open Source License
/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @throws IOException
 */
@Override
protected void writeTextCellMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen)
        throws IOException {
    //estimate output size and number of output blocks (min 1)
    int numPartFiles = (int) (OptimizerUtils.estimateSizeTextOutput(src.getNumRows(), src.getNumColumns(),
            src.getNonZeros(), OutputInfo.TextCellOutputInfo) / InfrastructureAnalyzer.getHDFSBlockSize());
    numPartFiles = Math.max(numPartFiles, 1);

    //determine degree of parallelism
    int numThreads = OptimizerUtils.getParallelTextWriteParallelism();
    numThreads = Math.min(numThreads, numPartFiles);

    //fall back to sequential write if dop is 1 (e.g., <128MB) in order to create single file
    if (numThreads <= 1) {
        super.writeTextCellMatrixToHDFS(path, job, src, rlen, clen);
        return;
    }

    //create directory for concurrent tasks
    MapReduceTool.createDirIfNotExistOnHDFS(path.toString(), DMLConfig.DEFAULT_SHARED_DIR_PERMISSION);

    //create and execute tasks
    try {
        ExecutorService pool = Executors.newFixedThreadPool(numThreads);
        ArrayList<WriteTextTask> tasks = new ArrayList<WriteTextTask>();
        int blklen = (int) Math.ceil((double) rlen / numThreads);
        for (int i = 0; i < numThreads & i * blklen < rlen; i++) {
            Path newPath = new Path(path, String.format("0-m-%05d", i));
            tasks.add(new WriteTextTask(newPath, job, src, i * blklen, (int) Math.min((i + 1) * blklen, rlen)));
        }

        //wait until all tasks have been executed
        List<Future<Object>> rt = pool.invokeAll(tasks);
        pool.shutdown();

        //check for exceptions
        for (Future<Object> task : rt)
            task.get();
    } catch (Exception e) {
        throw new IOException("Failed parallel write of text output.", e);
    }
}
From source file:com.ibm.bi.dml.runtime.io.WriterTextCSV.java
License:Open Source License
/**
 * Method to merge multiple CSV part files on HDFS into a single CSV file on HDFS.
 * The part files are created by the CSV_WRITE MR job.
 *
 * This method is invoked from the CP-write instruction.
 *
 * @param srcFileName
 * @param destFileName
 * @param csvprop
 * @param rlen
 * @param clen
 * @throws IOException
 */
public void mergeCSVPartFiles(String srcFileName, String destFileName, CSVFileFormatProperties csvprop,
        long rlen, long clen) throws IOException {
    Configuration conf = new Configuration(ConfigurationManager.getCachedJobConf());

    Path srcFilePath = new Path(srcFileName);
    Path mergedFilePath = new Path(destFileName);
    FileSystem hdfs = FileSystem.get(conf);

    if (hdfs.exists(mergedFilePath)) {
        hdfs.delete(mergedFilePath, true);
    }
    OutputStream out = hdfs.create(mergedFilePath, true);

    // write out the header, if needed
    if (csvprop.hasHeader()) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < clen; i++) {
            sb.append("C" + (i + 1));
            if (i < clen - 1)
                sb.append(csvprop.getDelim());
        }
        sb.append('\n');
        out.write(sb.toString().getBytes());
        sb.setLength(0);
    }

    // if the source is a directory
    if (hdfs.isDirectory(srcFilePath)) {
        try {
            FileStatus[] contents = hdfs.listStatus(srcFilePath);
            Path[] partPaths = new Path[contents.length];
            int numPartFiles = 0;
            for (int i = 0; i < contents.length; i++) {
                if (!contents[i].isDirectory()) {
                    partPaths[i] = contents[i].getPath();
                    numPartFiles++;
                }
            }
            Arrays.sort(partPaths);

            for (int i = 0; i < numPartFiles; i++) {
                InputStream in = hdfs.open(partPaths[i]);
                try {
                    IOUtils.copyBytes(in, out, conf, false);
                    if (i < numPartFiles - 1)
                        out.write('\n');
                } finally {
                    IOUtilFunctions.closeSilently(in);
                }
            }
        } finally {
            IOUtilFunctions.closeSilently(out);
        }
    } else if (hdfs.isFile(srcFilePath)) {
        InputStream in = null;
        try {
            in = hdfs.open(srcFilePath);
            IOUtils.copyBytes(in, out, conf, true);
        } finally {
            IOUtilFunctions.closeSilently(in);
            IOUtilFunctions.closeSilently(out);
        }
    } else {
        throw new IOException(srcFilePath.toString() + ": No such file or directory");
    }
}