Example usage for org.apache.hadoop.fs FileSystem getDefaultBlockSize

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem getDefaultBlockSize.

Prototype

public long getDefaultBlockSize(Path f) 

Document

Return the number of bytes that large input files should optimally be split into to minimize I/O time.
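
For orientation, here is a minimal, self-contained sketch of calling this method; the class name and path below are hypothetical and not part of the examples that follow.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class DefaultBlockSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Hypothetical target path; resolve the FileSystem that owns it.
        Path target = new Path("/tmp/example/output.dat");
        FileSystem fs = target.getFileSystem(conf);

        // Default block size the filesystem would use for a file at this path,
        // suitable as the block size argument when creating a new file.
        long blockSize = fs.getDefaultBlockSize(target);
        System.out.println("Default block size for " + target + ": " + blockSize + " bytes");
    }
}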

Usage

From source file: org.apache.nifi.processors.hadoop.PutHDFS.java

License: Apache License

@Override
public void onTrigger(ProcessContext context, ProcessSession session) throws ProcessException {
    final FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final FileSystem hdfs = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();

    if (configuration == null || hdfs == null || ugi == null) {
        getLogger().error("HDFS not configured properly");
        session.transfer(flowFile, REL_FAILURE);
        context.yield();
        return;
    }

    ugi.doAs(new PrivilegedAction<Object>() {
        @Override
        public Object run() {
            Path tempDotCopyFile = null;
            FlowFile putFlowFile = flowFile;
            try {
                final String dirValue = context.getProperty(DIRECTORY).evaluateAttributeExpressions(putFlowFile)
                        .getValue();
                final Path configuredRootDirPath = new Path(dirValue);

                final String conflictResponse = context.getProperty(CONFLICT_RESOLUTION).getValue();

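                // Use the configured Block Size property when present; otherwise fall back to
                // the filesystem's default block size for the configured directory.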
                final Double blockSizeProp = context.getProperty(BLOCK_SIZE).asDataSize(DataUnit.B);
                final long blockSize = blockSizeProp != null ? blockSizeProp.longValue()
                        : hdfs.getDefaultBlockSize(configuredRootDirPath);

                final Double bufferSizeProp = context.getProperty(BUFFER_SIZE).asDataSize(DataUnit.B);
                final int bufferSize = bufferSizeProp != null ? bufferSizeProp.intValue()
                        : configuration.getInt(BUFFER_SIZE_KEY, BUFFER_SIZE_DEFAULT);

                final Integer replicationProp = context.getProperty(REPLICATION_FACTOR).asInteger();
                final short replication = replicationProp != null ? replicationProp.shortValue()
                        : hdfs.getDefaultReplication(configuredRootDirPath);

                final CompressionCodec codec = getCompressionCodec(context, configuration);

                final String filename = codec != null
                        ? putFlowFile.getAttribute(CoreAttributes.FILENAME.key()) + codec.getDefaultExtension()
                        : putFlowFile.getAttribute(CoreAttributes.FILENAME.key());

                final Path tempCopyFile = new Path(configuredRootDirPath, "." + filename);
                final Path copyFile = new Path(configuredRootDirPath, filename);

                // Create destination directory if it does not exist
                try {
                    if (!hdfs.getFileStatus(configuredRootDirPath).isDirectory()) {
                        throw new IOException(
                                configuredRootDirPath.toString() + " already exists and is not a directory");
                    }
                } catch (FileNotFoundException fe) {
                    if (!hdfs.mkdirs(configuredRootDirPath)) {
                        throw new IOException(configuredRootDirPath.toString() + " could not be created");
                    }
                    changeOwner(context, hdfs, configuredRootDirPath, flowFile);
                }

                final boolean destinationExists = hdfs.exists(copyFile);

                // If destination file already exists, resolve that based on processor configuration
                if (destinationExists) {
                    switch (conflictResponse) {
                    case REPLACE_RESOLUTION:
                        if (hdfs.delete(copyFile, false)) {
                            getLogger().info("deleted {} in order to replace with the contents of {}",
                                    new Object[] { copyFile, putFlowFile });
                        }
                        break;
                    case IGNORE_RESOLUTION:
                        session.transfer(putFlowFile, REL_SUCCESS);
                        getLogger().info(
                                "transferring {} to success because file with same name already exists",
                                new Object[] { putFlowFile });
                        return null;
                    case FAIL_RESOLUTION:
                        session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                        getLogger().warn(
                                "penalizing {} and routing to failure because file with same name already exists",
                                new Object[] { putFlowFile });
                        return null;
                    default:
                        break;
                    }
                }

                // Write FlowFile to temp file on HDFS
                final StopWatch stopWatch = new StopWatch(true);
                session.read(putFlowFile, new InputStreamCallback() {

                    @Override
                    public void process(InputStream in) throws IOException {
                        OutputStream fos = null;
                        Path createdFile = null;
                        try {
                            if (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && destinationExists) {
                                fos = hdfs.append(copyFile, bufferSize);
                            } else {
                                fos = hdfs.create(tempCopyFile, true, bufferSize, replication, blockSize);
                            }
                            if (codec != null) {
                                fos = codec.createOutputStream(fos);
                            }
                            createdFile = tempCopyFile;
                            BufferedInputStream bis = new BufferedInputStream(in);
                            StreamUtils.copy(bis, fos);
                            bis = null;
                            fos.flush();
                        } finally {
                            try {
                                if (fos != null) {
                                    fos.close();
                                }
                            } catch (RemoteException re) {
                                // when talking to remote HDFS clusters, we don't notice problems until fos.close()
                                if (createdFile != null) {
                                    try {
                                        hdfs.delete(createdFile, false);
                                    } catch (Throwable ignore) {
                                    }
                                }
                                throw re;
                            } catch (Throwable ignore) {
                            }
                            fos = null;
                        }
                    }

                });
                stopWatch.stop();
                final String dataRate = stopWatch.calculateDataRate(putFlowFile.getSize());
                final long millis = stopWatch.getDuration(TimeUnit.MILLISECONDS);
                tempDotCopyFile = tempCopyFile;

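                // Unless we appended to an existing file, rename the temporary dot file to its final name.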
                if (!conflictResponse.equals(APPEND_RESOLUTION_AV.getValue())
                        || (conflictResponse.equals(APPEND_RESOLUTION_AV.getValue()) && !destinationExists)) {
                    boolean renamed = false;
                    for (int i = 0; i < 10; i++) { // try to rename multiple times.
                        if (hdfs.rename(tempCopyFile, copyFile)) {
                            renamed = true;
                            break;// rename was successful
                        }
                        Thread.sleep(200L);// try waiting to let whatever might cause rename failure to resolve
                    }
                    if (!renamed) {
                        hdfs.delete(tempCopyFile, false);
                        throw new ProcessException("Copied file to HDFS but could not rename dot file "
                                + tempCopyFile + " to its final filename");
                    }

                    changeOwner(context, hdfs, copyFile, flowFile);
                }

                getLogger().info("copied {} to HDFS at {} in {} milliseconds at a rate of {}",
                        new Object[] { putFlowFile, copyFile, millis, dataRate });

                final String newFilename = copyFile.getName();
                final String hdfsPath = copyFile.getParent().toString();
                putFlowFile = session.putAttribute(putFlowFile, CoreAttributes.FILENAME.key(), newFilename);
                putFlowFile = session.putAttribute(putFlowFile, ABSOLUTE_HDFS_PATH_ATTRIBUTE, hdfsPath);
                final Path qualifiedPath = copyFile.makeQualified(hdfs.getUri(), hdfs.getWorkingDirectory());
                session.getProvenanceReporter().send(putFlowFile, qualifiedPath.toString());

                session.transfer(putFlowFile, REL_SUCCESS);

            } catch (final Throwable t) {
                if (tempDotCopyFile != null) {
                    try {
                        hdfs.delete(tempDotCopyFile, false);
                    } catch (Exception e) {
                        getLogger().error("Unable to remove temporary file {} due to {}",
                                new Object[] { tempDotCopyFile, e });
                    }
                }
                getLogger().error("Failed to write to HDFS due to {}", new Object[] { t });
                session.transfer(session.penalize(putFlowFile), REL_FAILURE);
                context.yield();
            }

            return null;
        }
    });
}

From source file: org.apache.parquet.hadoop.ParquetFileWriter.java

License: Apache License

/**
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param mode file creation mode
 * @param rowGroupSize the row group size
 * @throws IOException if the file can not be created
 */
public ParquetFileWriter(Configuration configuration, MessageType schema, Path file, Mode mode,
        long rowGroupSize, int maxPaddingSize) throws IOException {
    TypeUtil.checkValidWriteSchema(schema);
    this.schema = schema;
    FileSystem fs = file.getFileSystem(configuration);
    boolean overwriteFlag = (mode == Mode.OVERWRITE);

    if (supportsBlockSize(fs)) {
        // use the default block size, unless row group size is larger
        long dfsBlockSize = Math.max(fs.getDefaultBlockSize(file), rowGroupSize);

        this.alignment = PaddingAlignment.get(dfsBlockSize, rowGroupSize, maxPaddingSize);
        this.out = fs.create(file, overwriteFlag, DFS_BUFFER_SIZE_DEFAULT, fs.getDefaultReplication(file),
                dfsBlockSize);

    } else {
        this.alignment = NoAlignment.get(rowGroupSize);
        this.out = fs.create(file, overwriteFlag);
    }
}

From source file: org.apache.pig.backend.hadoop.executionengine.shims.HadoopShims.java

License: Apache License

public static long getDefaultBlockSize(FileSystem fs, Path path) {
    return fs.getDefaultBlockSize(path);
}

From source file: org.mrgeo.hdfs.output.image.HdfsMrsImagePyramidOutputFormatProvider.java

License: Apache License

private void setupSingleOutput(final Job job, final String outputWithZoom) throws IOException {
    job.setOutputFormatClass(HdfsMrsPyramidOutputFormat.class);
    HdfsMrsPyramidOutputFormat.setOutputInfo(job, outputWithZoom);

    final Configuration conf = job.getConfiguration();
    final FileSystem fs = HadoopFileUtils.getFileSystem(conf);

    // Set up partitioner
    final int tilesize = context.getTilesize();
    final int zoom = context.getZoomlevel();
    final Bounds bounds = context.getBounds();

    final LongRectangle tileBounds = TMSUtils.boundsToTile(bounds.getTMSBounds(), zoom, tilesize)
            .toLongRectangle();

    final int increment = conf.getInt(TileIdPartitioner.INCREMENT_KEY, -1);
    if (increment != -1) {
        // if increment is provided, use it to setup the partitioner
        splitFileTmp = TileIdPartitioner.setup(job, new ImageSplitGenerator(tileBounds.getMinX(),
                tileBounds.getMinY(), tileBounds.getMaxX(), tileBounds.getMaxY(), zoom, increment));
    } else if (!context.isCalculatePartitions()) {
        // can't calculate partitions on size, just use increment of 1 (1 row per partition)
        splitFileTmp = TileIdPartitioner.setup(job, new ImageSplitGenerator(tileBounds.getMinX(),
                tileBounds.getMinY(), tileBounds.getMaxX(), tileBounds.getMaxY(), zoom, 1));
    } else {

        final int bands = context.getBands();
        final int tiletype = context.getTiletype();

        final int tileSizeBytes = tilesize * tilesize * bands * RasterUtils.getElementSize(tiletype);

        // if increment is not provided, set up the partitioner using max partitions
        final String strMaxPartitions = conf.get(TileIdPartitioner.MAX_PARTITIONS_KEY);
        if (strMaxPartitions != null) {
            // We know the max partitions conf setting exists, so read it; the
            // hard-coded default value of 1000 is never used.
            final int maxPartitions = conf.getInt(TileIdPartitioner.MAX_PARTITIONS_KEY, 1000);
            splitFileTmp = TileIdPartitioner.setup(job, new ImageSplitGenerator(tileBounds, zoom, tileSizeBytes,
                    fs.getDefaultBlockSize(new Path("/")), maxPartitions));
        } else {
            splitFileTmp = TileIdPartitioner.setup(job, new ImageSplitGenerator(tileBounds, zoom, tileSizeBytes,
                    fs.getDefaultBlockSize(new Path("/"))));
        }
    }
}