Example usage for org.apache.hadoop.fs FileSystem getDefaultBlockSize

Introduction

This page collects example usages of org.apache.hadoop.fs.FileSystem#getDefaultBlockSize from open source projects.

Prototype

@Deprecated
public long getDefaultBlockSize() 

Documentation

Return the number of bytes that large input files should optimally be split into to minimize I/O time.
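The no-argument form is deprecated in favor of the Path-aware overload getDefaultBlockSize(Path), which resolves the default block size of the file system that actually stores a given path. A minimal sketch of both calls (the path used here is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultBlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Deprecated: block size of the default file system
        long legacy = fs.getDefaultBlockSize();

        // Preferred: block size resolved for a concrete path (illustrative path)
        long perPath = fs.getDefaultBlockSize(new Path("/user/demo/data"));

        System.out.println(legacy + " vs " + perPath);
    }
}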

Usage

From source file: org.apache.sysml.runtime.io.WriterBinaryBlock.java

License: Apache License

@SuppressWarnings("deprecation")
protected final void writeBinaryBlockMatrixToSequenceFile(Path path, JobConf job, FileSystem fs,
        MatrixBlock src, int brlen, int bclen, int rl, int ru) throws DMLRuntimeException, IOException {
    boolean sparse = src.isInSparseFormat();
    int rlen = src.getNumRows();
    int clen = src.getNumColumns();

    // 1) create sequence file writer, with right replication factor 
    // (config via MRConfigurationNames.DFS_REPLICATION not possible since sequence file internally calls fs.getDefaultReplication())
    SequenceFile.Writer writer = null;
    if (_replication > 0) //if replication specified (otherwise default)
    {
        //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class,
                job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096), (short) _replication,
                fs.getDefaultBlockSize(), null, new SequenceFile.Metadata());
    } else {
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    }

    try {
        // 2) bound check for src block
        if (src.getNumRows() > rlen || src.getNumColumns() > clen) {
            throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        }

        //3) reblock and write
        MatrixIndexes indexes = new MatrixIndexes();

        if (rlen <= brlen && clen <= bclen && rl == 0) //opt for single block
        {
            //directly write single block
            indexes.setIndexes(1, 1);
            writer.append(indexes, src);
        } else //general case
        {
            //initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse,
                    src.getNonZeros());

            //create and write subblocks of matrix
            for (int blockRow = rl / brlen; blockRow < (int) Math.ceil(ru / (double) brlen); blockRow++)
                for (int blockCol = 0; blockCol < (int) Math
                        .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                    int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                            : src.getNumRows() - blockRow * brlen;
                    int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                            : src.getNumColumns() - blockCol * bclen;

                    int row_offset = blockRow * brlen;
                    int col_offset = blockCol * bclen;

                    //get reuse matrix block
                    MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                    //copy submatrix to block
                    src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                            col_offset + maxCol - 1, block);

                    //append block to sequence file
                    indexes.setIndexes(blockRow + 1, blockCol + 1);
                    writer.append(indexes, block);

                    //reset block for later reuse
                    block.reset();
                }
        }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}
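The deprecated SequenceFile.Writer constructor is used above only because it is the variant that accepts an explicit replication factor and block size. On Hadoop 2.x the same intent could be expressed with the non-deprecated createWriter and its Writer.Option parameters; a sketch under that assumption, reusing the fields from the method above:

// Sketch (Hadoop 2.x options API): same writer configuration without
// the deprecated constructor; path, job, fs, and _replication as above
SequenceFile.Writer writer = SequenceFile.createWriter(job,
        SequenceFile.Writer.file(path),
        SequenceFile.Writer.keyClass(MatrixIndexes.class),
        SequenceFile.Writer.valueClass(MatrixBlock.class),
        SequenceFile.Writer.bufferSize(job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096)),
        SequenceFile.Writer.replication((short) _replication),
        SequenceFile.Writer.blockSize(fs.getDefaultBlockSize(path)));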

From source file: org.apache.sysml.runtime.io.WriterBinaryBlock.java

License: Apache License

@SuppressWarnings("deprecation")
protected final void writeDiagBinaryBlockMatrixToHDFS(Path path, JobConf job, FileSystem fs, MatrixBlock src,
        long rlen, long clen, int brlen, int bclen) throws IOException, DMLRuntimeException {
    boolean sparse = src.isInSparseFormat();

    // 1) create sequence file writer, with right replication factor 
    // (config via MRConfigurationNames.DFS_REPLICATION not possible since sequence file internally calls fs.getDefaultReplication())
    SequenceFile.Writer writer = null;
    if (_replication > 0) //if replication specified (otherwise default)
    {
        //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class,
                job.getInt(MRConfigurationNames.IO_FILE_BUFFER_SIZE, 4096), (short) _replication,
                fs.getDefaultBlockSize(), null, new SequenceFile.Metadata());
    } else {
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    }

    try {
        // 2) bound check for src block
        if (src.getNumRows() > rlen || src.getNumColumns() > clen) {
            throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        }

        //3) reblock and write
        MatrixIndexes indexes = new MatrixIndexes();

        if (rlen <= brlen && clen <= bclen) //opt for single block
        {
            //directly write single block
            indexes.setIndexes(1, 1);
            writer.append(indexes, src);
        } else //general case
        {
            //initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse,
                    src.getNonZeros());
            MatrixBlock emptyBlock = new MatrixBlock();

            //create and write subblocks of matrix
            for (int blockRow = 0; blockRow < (int) Math.ceil(src.getNumRows() / (double) brlen); blockRow++)
                for (int blockCol = 0; blockCol < (int) Math
                        .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                    int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                            : src.getNumRows() - blockRow * brlen;
                    int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                            : src.getNumColumns() - blockCol * bclen;
                    MatrixBlock block = null;

                    if (blockRow == blockCol) //block on diagonal
                    {
                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);
                    } else //empty block (not on diagonal)
                    {
                        block = emptyBlock;
                        block.reset(maxRow, maxCol);
                    }

                    //append block to sequence file
                    indexes.setIndexes(blockRow + 1, blockCol + 1);
                    writer.append(indexes, block);

                    //reset block for later reuse
                    if (blockRow != blockCol)
                        block.reset();
                }
        }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}

From source file: org.apache.tajo.storage.AbstractStorageManager.java

License: Apache License

public FileFragment[] split(Path tablePath) throws IOException {
    FileSystem fs = tablePath.getFileSystem(conf);
    return split(tablePath.getName(), tablePath, fs.getDefaultBlockSize());
}
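Since the no-argument call is deprecated, the same split size can be obtained per path; a hedged variant of the method above using the Path-aware overload:

public FileFragment[] split(Path tablePath) throws IOException {
    FileSystem fs = tablePath.getFileSystem(conf);
    // Resolve the default block size for this table's path rather than
    // relying on the deprecated no-argument form
    return split(tablePath.getName(), tablePath, fs.getDefaultBlockSize(tablePath));
}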

From source file: org.apache.trevni.avro.AvroTrevniOutputFormat.java

License: Apache License

@Override
public RecordWriter<AvroWrapper<T>, NullWritable> getRecordWriter(FileSystem ignore, final JobConf job,
        final String name, Progressable prog) throws IOException {

    boolean isMapOnly = job.getNumReduceTasks() == 0;
    final Schema schema = isMapOnly ? AvroJob.getMapOutputSchema(job) : AvroJob.getOutputSchema(job);

    final ColumnFileMetaData meta = filterMetadata(job);

    final Path dir = FileOutputFormat.getTaskOutputPath(job, name);
    final FileSystem fs = dir.getFileSystem(job);
    if (!fs.mkdirs(dir))
        throw new IOException("Failed to create directory: " + dir);
    final long blockSize = fs.getDefaultBlockSize();

    return new RecordWriter<AvroWrapper<T>, NullWritable>() {
        private int part = 0;

        private AvroColumnWriter<T> writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());

        private void flush() throws IOException {
            OutputStream out = fs.create(new Path(dir, "part-" + (part++) + EXT));
            try {
                writer.writeTo(out);
            } finally {
                out.close();
            }
            writer = new AvroColumnWriter<T>(schema, meta, ReflectData.get());
        }

        public void write(AvroWrapper<T> wrapper, NullWritable ignore) throws IOException {
            writer.write(wrapper.datum());
            if (writer.sizeEstimate() >= blockSize) // block full
                flush();
        }

        public void close(Reporter reporter) throws IOException {
            flush();
        }
    };
}
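The design choice here: the writer rolls over to a new part file once its in-memory size estimate reaches the file system's default block size, so each Trevni part file fits in roughly one HDFS block and can be read without crossing block boundaries.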

From source file: org.qcri.pca.SPCADriver.java

/**
 * Run sPCA
 * 
 * @param conf
 *          the configuration
 * @param distY
 *          the distributed input matrix Y
 * @param initVal
 *          the initial values of ss and the principal-components matrix C
 * @param output
 *          the path to the output (currently for normalization output)
 * @param nRows
 *          number of rows in input matrix
 * @param nCols
 *          number of columns in input matrix
 * @param nPCs
 *          number of desired principal components
 * @param splitFactor
 *          divide the block size by this number to increase parallelism
 * @param errSampleRate
 *          if < 1, the reconstruction error is computed on a sample
 * @param LAST_ROUND
 *          the index of the last round
 * @param normalize
 *          if 1, the input matrix is normalized first
 * @return the error
 * @throws Exception
 */
double runMapReduce(Configuration conf, DistributedRowMatrix distY, InitialValues initVal, Path output,
        final int nRows, final int nCols, final int nPCs, final int splitFactor, final float errSampleRate,
        final int LAST_ROUND, final int normalize) throws Exception {
    int round = 0;
    //The two PPCA variables that improve over each iteration
    double ss = initVal.ss;
    Matrix centralC = initVal.C;
    //initial CtC
    Matrix centralCtC = centralC.transpose().times(centralC);
    final float threshold = 0.00001f;
    int sampleRate = 1;
    //1. compute mean and span
    DenseVector ym = new DenseVector(distY.numCols()); //ym=mean(distY)
    MeanAndSpanJob masJob = new MeanAndSpanJob();
    boolean normalizeMean = false;
    if (normalize == 1)
        normalizeMean = true;
    Path meanSpanPath = masJob.compuateMeanAndSpan(distY.getRowPath(), output, ym, normalizeMean, conf,
            "" + round + "-init");
    Path normalizedYPath = null;

    //2. normalize the input matrix Y
    if (normalize == 1) {

        NormalizeJob normalizeJob = new NormalizeJob();
        normalizedYPath = normalizeJob.normalize(conf, distY.getRowPath(), meanSpanPath, output, sampleRate,
                "" + round + "-init");
        distY = new DistributedRowMatrix(normalizedYPath, getTempPath(), nRows, nCols);
        distY.setConf(conf);
        //After normalization, set the split factor
        if (splitFactor > 1) {
            FileSystem fss = FileSystem.get(normalizedYPath.toUri(), conf);
            long blockSize = fss.getDefaultBlockSize() / splitFactor;
            conf.set("mapred.max.split.size", Long.toString(blockSize));
        }
    }
    if (normalizedYPath == null)
        normalizedYPath = distY.getRowPath();

    //3. compute the 2-norm of Y
    Norm2Job normJob = new Norm2Job();
    double norm2 = normJob.computeFNorm(conf, normalizedYPath, meanSpanPath, getTempPath(),
            "" + round + "-init");
    if (sampleRate < 1) { // rescale
        norm2 = norm2 / sampleRate;
    }

    DenseVector xm = new DenseVector(nPCs);
    log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss);
    DistributedRowMatrix distY2X = null;
    DistributedRowMatrix distC = null;
    double prevObjective = Double.MAX_VALUE;
    double error = 0;
    double relChangeInObjective = Double.MAX_VALUE;
    double prevError = Double.MAX_VALUE;
    for (; (round < LAST_ROUND && relChangeInObjective > threshold && prevError > 0.02); round++) {
        // Sx = inv( ss * eye(d) + CtC );
        Matrix centralSx = centralCtC.clone();
        centralSx.viewDiagonal().assign(Functions.plus(ss));
        centralSx = inv(centralSx);
        // X = Y * C * Sx' => Y2X = C * Sx'
        Matrix centralY2X = centralC.times(centralSx.transpose());
        distY2X = PCACommon.toDistributedRowMatrix(centralY2X, getTempPath(), getTempPath(), "CSxt" + round);
        // Xm = Ym * Y2X
        PCACommon.denseVectorTimesMatrix(ym, centralY2X, xm);
        // We skip computing X as we generate it on demand using Y and Y2X

        //Compute X'X and Y'X 
        CompositeJob compositeJob = new CompositeJob();
        compositeJob.computeYtXandXtX(distY, distY2X, ym, xm, getTempPath(), conf, "" + round);
        Matrix centralXtX = compositeJob.xtx;
        Matrix centralYtX = compositeJob.ytx;
        if (sampleRate < 1) { // rescale
            centralXtX.assign(Functions.div(sampleRate));
            centralYtX.assign(Functions.div(sampleRate));
        }

        // XtX = X'*X + ss * Sx
        final double finalss = ss;
        centralXtX.assign(centralSx, new DoubleDoubleFunction() {
            @Override
            public double apply(double arg1, double arg2) {
                return arg1 + finalss * arg2;
            }
        });
        // C = (Ye'*X) / SumXtX;
        Matrix invXtX_central = inv(centralXtX);
        centralC = centralYtX.times(invXtX_central);
        distC = PCACommon.toDistributedRowMatrix(centralC, getTempPath(), getTempPath(), "C" + round);
        centralCtC = centralC.transpose().times(centralC);

        // Compute new value for ss
        // ss = ( sum(sum(Ye.^2)) + PCACommon.trace(XtX*CtC) - 2sum(XiCtYit) )
        // /(N*D);
        double ss2 = PCACommon.trace(centralXtX.times(centralCtC));
        VarianceJob varianceJob = new VarianceJob();
        double xctyt = varianceJob.computeVariance(distY, ym, distY2X, xm, distC, getTempPath(), conf,
                "" + round);
        if (sampleRate < 1) { // rescale
            xctyt = xctyt / sampleRate;
        }
        ss = (norm2 + ss2 - 2 * xctyt) / (nRows * nCols);
        log.info("SSSSSSSSSSSSSSSSSSSSSSSSSSSS " + ss + " (" + norm2 + " + " + ss2 + " -2* " + xctyt);
        double traceSx = PCACommon.trace(centralSx);
        double traceXtX = PCACommon.trace(centralXtX);
        double traceC = PCACommon.trace(centralC);
        double traceCtC = PCACommon.trace(centralCtC);
        log.info("TTTTTTTTTTTTTTTTT " + traceSx + " " + traceXtX + " " + traceC + " " + traceCtC);

        double objective = ss;
        relChangeInObjective = Math.abs(1 - objective / prevObjective);
        prevObjective = objective;
        log.info("Objective:  %.6f    relative change: %.6f \n", objective, relChangeInObjective);
        if (!CALCULATE_ERR_ATTHEEND) {
            log.info("Computing the error at round " + round + " ...");
            ReconstructionErrJob errJob = new ReconstructionErrJob();
            error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
                    getTempPath(), "" + round);
            log.info("... end of computing the error at round " + round);
            prevError = error;
        }
    }

    if (CALCULATE_ERR_ATTHEEND) {
        log.info("Computing the error at round " + round + " ...");
        ReconstructionErrJob errJob = new ReconstructionErrJob();
        error = errJob.reconstructionErr(distY, distY2X, distC, centralC, ym, xm, errSampleRate, conf,
                getTempPath(), "" + round);
        log.info("... end of computing the error at round " + round);
    }

    initVal.C = centralC;
    initVal.ss = ss;
    writeMatrix(initVal.C, output, getTempPath(), "PCs");
    return error;

}
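In step 2 above, dividing fs.getDefaultBlockSize() by splitFactor and storing the result in mapred.max.split.size caps the input split size, so more map tasks run against the normalized matrix. mapred.max.split.size is the old Hadoop 1 property name; a minimal sketch of the same tuning with the Hadoop 2 name (the property string is assumed from the Hadoop 2 rename of this setting):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class SplitSizeTuning {
    // Sketch: cap the maximum input split at a fraction of the default
    // block size so more map tasks are scheduled per input file
    public static void capSplitSize(Configuration conf, int splitFactor) throws IOException {
        FileSystem fs = FileSystem.get(conf);
        long maxSplit = fs.getDefaultBlockSize() / splitFactor;
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", maxSplit);
    }
}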