Example usage for org.apache.hadoop.fs FileSystem getDefaultBlockSize

List of usage examples for org.apache.hadoop.fs FileSystem getDefaultBlockSize

Introduction

On this page you can find example usages of org.apache.hadoop.fs FileSystem getDefaultBlockSize.

Prototype

@Deprecated
public long getDefaultBlockSize() 

Document

Return the number of bytes that large input files should optimally be split into to minimize I/O time.
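
Before the full examples, here is a minimal, self-contained sketch of the typical call pattern. It assumes a default Configuration that resolves to your cluster (or falls back to the local filesystem), and the path "/tmp/example.seq" is only a placeholder. The no-argument getDefaultBlockSize() is deprecated; Hadoop 2.x and later also provide the path-aware overload getDefaultBlockSize(Path), shown alongside for comparison.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultBlockSizeExample {
    public static void main(String[] args) throws Exception {
        // Picks up core-site.xml / hdfs-site.xml if they are on the classpath,
        // otherwise falls back to the local filesystem.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Deprecated no-argument variant, as used by most of the examples below.
        @SuppressWarnings("deprecation")
        long defaultBlockSize = fs.getDefaultBlockSize();

        // Path-aware replacement (Hadoop 2.x and later); the path matters for
        // filesystems whose default block size can vary by location.
        long pathBlockSize = fs.getDefaultBlockSize(new Path("/tmp/example.seq"));

        System.out.println("Default block size:          " + defaultBlockSize);
        System.out.println("Default block size for path: " + pathBlockSize);
    }
}

In the snippets that follow, the returned value is typically passed straight into SequenceFile.Writer constructors or FileSystem.create() calls, so that output files use the filesystem's default block size unless the caller overrides it.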

Usage

From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlock.java

License:Open Source License

/**
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param replication
 * @throws IOException
 * @throws DMLUnsupportedOperationException
 * @throws DMLRuntimeException
 */
@SuppressWarnings("deprecation")
protected void writeBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    boolean sparse = src.isInSparseFormat();
    FileSystem fs = FileSystem.get(job);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    // 1) create sequence file writer, with right replication factor 
    // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
    SequenceFile.Writer writer = null;
    if (replication > 0) //if replication specified (otherwise default)
    {
        //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class,
                job.getInt("io.file.buffer.size", 4096), (short) replication, fs.getDefaultBlockSize(), null,
                new SequenceFile.Metadata());
    } else {
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    }

    try {
        // 2) bound check for src block
        if (src.getNumRows() > rlen || src.getNumColumns() > clen) {
            throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        }

        //3) reblock and write
        MatrixIndexes indexes = new MatrixIndexes();

        if (rlen <= brlen && clen <= bclen) //opt for single block
        {
            //directly write single block
            indexes.setIndexes(1, 1);
            writer.append(indexes, src);
        } else //general case
        {
            //initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse,
                    src.getNonZeros());

            //create and write subblocks of matrix
            for (int blockRow = 0; blockRow < (int) Math.ceil(src.getNumRows() / (double) brlen); blockRow++)
                for (int blockCol = 0; blockCol < (int) Math
                        .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                    int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                            : src.getNumRows() - blockRow * brlen;
                    int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                            : src.getNumColumns() - blockCol * bclen;

                    int row_offset = blockRow * brlen;
                    int col_offset = blockCol * bclen;

                    //get reuse matrix block
                    MatrixBlock block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                    //copy submatrix to block
                    src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                            col_offset + maxCol - 1, block);

                    //append block to sequence file
                    indexes.setIndexes(blockRow + 1, blockCol + 1);
                    writer.append(indexes, block);

                    //reset block for later reuse
                    block.reset();
                }
        }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}

From source file:com.ibm.bi.dml.runtime.io.WriterBinaryBlock.java

License:Open Source License

/**
 *
 * @param path
 * @param job
 * @param src
 * @param rlen
 * @param clen
 * @param brlen
 * @param bclen
 * @param replication
 * @throws IOException
 * @throws DMLUnsupportedOperationException 
 * @throws DMLRuntimeException 
 */
@SuppressWarnings("deprecation")
protected void writeDiagBinaryBlockMatrixToHDFS(Path path, JobConf job, MatrixBlock src, long rlen, long clen,
        int brlen, int bclen, int replication)
        throws IOException, DMLRuntimeException, DMLUnsupportedOperationException {
    boolean sparse = src.isInSparseFormat();
    FileSystem fs = FileSystem.get(job);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    // 1) create sequence file writer, with right replication factor 
    // (config via 'dfs.replication' not possible since sequence file internally calls fs.getDefaultReplication())
    SequenceFile.Writer writer = null;
    if (replication > 0) //if replication specified (otherwise default)
    {
        //copy of SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class), except for replication
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class,
                job.getInt("io.file.buffer.size", 4096), (short) replication, fs.getDefaultBlockSize(), null,
                new SequenceFile.Metadata());
    } else {
        writer = new SequenceFile.Writer(fs, job, path, MatrixIndexes.class, MatrixBlock.class);
    }

    try {
        // 2) bound check for src block
        if (src.getNumRows() > rlen || src.getNumColumns() > clen) {
            throw new IOException("Matrix block [1:" + src.getNumRows() + ",1:" + src.getNumColumns() + "] "
                    + "out of overall matrix range [1:" + rlen + ",1:" + clen + "].");
        }

        //3) reblock and write
        MatrixIndexes indexes = new MatrixIndexes();

        if (rlen <= brlen && clen <= bclen) //opt for single block
        {
            //directly write single block
            indexes.setIndexes(1, 1);
            writer.append(indexes, src);
        } else //general case
        {
            //initialize blocks for reuse (at most 4 different blocks required)
            MatrixBlock[] blocks = createMatrixBlocksForReuse(rlen, clen, brlen, bclen, sparse,
                    src.getNonZeros());
            MatrixBlock emptyBlock = new MatrixBlock();

            //create and write subblocks of matrix
            for (int blockRow = 0; blockRow < (int) Math.ceil(src.getNumRows() / (double) brlen); blockRow++)
                for (int blockCol = 0; blockCol < (int) Math
                        .ceil(src.getNumColumns() / (double) bclen); blockCol++) {
                    int maxRow = (blockRow * brlen + brlen < src.getNumRows()) ? brlen
                            : src.getNumRows() - blockRow * brlen;
                    int maxCol = (blockCol * bclen + bclen < src.getNumColumns()) ? bclen
                            : src.getNumColumns() - blockCol * bclen;
                    MatrixBlock block = null;

                    if (blockRow == blockCol) //block on diagonal
                    {
                        int row_offset = blockRow * brlen;
                        int col_offset = blockCol * bclen;

                        //get reuse matrix block
                        block = getMatrixBlockForReuse(blocks, maxRow, maxCol, brlen, bclen);

                        //copy submatrix to block
                        src.sliceOperations(row_offset, row_offset + maxRow - 1, col_offset,
                                col_offset + maxCol - 1, block);
                    } else //empty block (not on diagonal)
                    {
                        block = emptyBlock;
                        block.reset(maxRow, maxCol);
                    }

                    //append block to sequence file
                    indexes.setIndexes(blockRow + 1, blockCol + 1);
                    writer.append(indexes, block);

                    //reset block for later reuse
                    if (blockRow != blockCol)
                        block.reset();
                }
        }
    } finally {
        IOUtilFunctions.closeSilently(writer);
    }
}

From source file:com.inmobi.conduit.distcp.tools.mapred.RetriableFileCopyCommand.java

License:Apache License

private static long getBlockSize(EnumSet<FileAttribute> fileAttributes, FileStatus sourceFile,
        FileSystem targetFS) {
    return fileAttributes.contains(FileAttribute.BLOCKSIZE) ? sourceFile.getBlockSize()
            : targetFS.getDefaultBlockSize();
}

From source file:com.inmobi.conduit.distcp.tools.mapred.TestCopyMapper.java

License:Apache License

private static void touchFile(String path) throws Exception {
    FileSystem fs;
    DataOutputStream outputStream = null;
    GzipCodec gzipCodec = ReflectionUtils.newInstance(GzipCodec.class, getConfiguration());
    Compressor gzipCompressor = CodecPool.getCompressor(gzipCodec);
    OutputStream compressedOut = null;
    try {
        fs = cluster.getFileSystem();
        final Path qualifiedPath = new Path(path).makeQualified(fs);
        final long blockSize = fs.getDefaultBlockSize() * 2;
        outputStream = fs.create(qualifiedPath, true, 0, (short) (fs.getDefaultReplication() * 2), blockSize);
        compressedOut = gzipCodec.createOutputStream(outputStream, gzipCompressor);
        Message msg = new Message("generating test data".getBytes());
        AuditUtil.attachHeaders(msg, currentTimestamp);
        byte[] encodeMsg = Base64.encodeBase64(msg.getData().array());
        compressedOut.write(encodeMsg);
        compressedOut.write("\n".getBytes());
        compressedOut.write(encodeMsg);
        compressedOut.write("\n".getBytes());
        // Generate a msg with a different timestamp. Default window period is 60 sec
        AuditUtil.attachHeaders(msg, nextMinuteTimeStamp);
        encodeMsg = Base64.encodeBase64(msg.getData().array());
        compressedOut.write(encodeMsg);
        compressedOut.write("\n".getBytes());
        compressedOut.flush();
        compressedOut.close();
        pathList.add(qualifiedPath);
        ++nFiles;

        FileStatus fileStatus = fs.getFileStatus(qualifiedPath);
        System.out.println(fileStatus.getBlockSize());
        System.out.println(fileStatus.getReplication());
    } finally {
        // Null-safe cleanup; avoids an NPE if fs.create() failed before
        // compressedOut was assigned, and a redundant second close().
        IOUtils.cleanup(null, compressedOut, outputStream);
        CodecPool.returnCompressor(gzipCompressor);
    }
}

From source file:com.inmobi.conduit.distcp.tools.TestDistCp.java

License:Apache License

private static void touchFile(String path) throws Exception {
    FileSystem fs;
    DataOutputStream outputStream = null;
    GzipCodec gzipCodec = ReflectionUtils.newInstance(GzipCodec.class, getConfigurationForCluster());
    Compressor gzipCompressor = CodecPool.getCompressor(gzipCodec);
    OutputStream compressedOut = null;
    try {
        fs = cluster.getFileSystem();
        final Path qualifiedPath = new Path(path).makeQualified(fs);
        final long blockSize = fs.getDefaultBlockSize() * 2;
        outputStream = fs.create(qualifiedPath, true, 0, (short) (fs.getDefaultReplication() * 2), blockSize);
        compressedOut = gzipCodec.createOutputStream(outputStream, gzipCompressor);
        compressedOut.write(new byte[FILE_SIZE]);
        compressedOut.write("\n".getBytes());
        compressedOut.flush();
        //outputStream.write(new byte[FILE_SIZE]);
        pathList.add(qualifiedPath);
    } finally {
        // Null-safe cleanup in case fs.create() failed before compressedOut was assigned.
        IOUtils.cleanup(null, compressedOut, outputStream);
        CodecPool.returnCompressor(gzipCompressor);
    }
}

From source file:dima.kmeansseq.SequenceFile.java

License:Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType) throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, new DefaultCodec(), null,
            new Metadata());
}

From source file:dima.kmeansseq.SequenceFile.java

License:Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @param progress
 *            The Progressable object to track progress.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType, Progressable progress) throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, new DefaultCodec(), progress,
            new Metadata());
}

From source file:dima.kmeansseq.SequenceFile.java

License:Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @param codec
 *            The compression codec.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec) throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, codec, null, new Metadata());
}

From source file:dima.kmeansseq.SequenceFile.java

License:Apache License

/**
 * Construct the preferred type of SequenceFile Writer.
 *
 * @param fs
 *            The configured filesystem.
 * @param conf
 *            The configuration.
 * @param name
 *            The name of the file.
 * @param keyClass
 *            The 'key' type.
 * @param valClass
 *            The 'value' type.
 * @param compressionType
 *            The compression type.
 * @param codec
 *            The compression codec.
 * @param progress
 *            The Progressable object to track progress.
 * @param metadata
 *            The metadata of the file.
 * @return Returns the handle to the constructed SequenceFile Writer.
 * @throws IOException
 */
public static Writer createWriter(FileSystem fs, Configuration conf, Path name, Class keyClass, Class valClass,
        CompressionType compressionType, CompressionCodec codec, Progressable progress, Metadata metadata)
        throws IOException {
    return createWriter(fs, conf, name, keyClass, valClass, fs.getConf().getInt("io.file.buffer.size", 4096),
            fs.getDefaultReplication(), fs.getDefaultBlockSize(), compressionType, codec, progress, metadata);
}

From source file:edu.umn.cs.spatialHadoop.operations.Repartition.java

License:Open Source License

public static void repartitionMapReduce(Path inFile, Path outPath, CellInfo[] cellInfos,
        OperationsParams params) throws IOException, InterruptedException {
    String sindex = params.get("sindex");
    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    FileSystem outFs = outPath.getFileSystem(params);

    // Calculate number of partitions in output file
    // Copy blocksize from source file if it's globally indexed
    @SuppressWarnings("deprecation")
    final long blockSize = outFs.getDefaultBlockSize();

    // Calculate the dimensions of each partition based on gindex type
    if (cellInfos == null) {
        if (sindex.equals("grid")) {
            Rectangle input_mbr = FileMBR.fileMBR(inFile, params);
            long inFileSize = FileMBR.sizeOfLastProcessedFile;
            int num_partitions = calculateNumberOfPartitions(new Configuration(), inFileSize, outFs, outPath,
                    blockSize);

            GridInfo gridInfo = new GridInfo(input_mbr.x1, input_mbr.y1, input_mbr.x2, input_mbr.y2);
            gridInfo.calculateCellDimensions(num_partitions);
            cellInfos = gridInfo.getAllCells();
        } else if (sindex.equals("rtree") || sindex.equals("r+tree") || sindex.equals("str")
                || sindex.equals("str+")) {
            // Pack in rectangles using an RTree
            cellInfos = packInRectangles(inFile, outPath, params);
        } else {
            throw new RuntimeException("Unsupported spatial index: " + sindex);
        }
    }

    JobConf job = new JobConf(params, Repartition.class);

    job.setJobName("Repartition");

    // Overwrite output file
    if (outFs.exists(outPath)) {
        if (overwrite)
            outFs.delete(outPath, true);
        else
            throw new RuntimeException(
                    "Output file '" + outPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid, str+, and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(job, inFile);
    job.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    JobClient.runJob(job);

}