List of usage examples for org.apache.hadoop.fs FileSystem getStatus
public FsStatus getStatus() throws IOException
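getStatus() returns an FsStatus describing the capacity, used, and remaining space of the filesystem, in bytes. Before the larger examples below, here is a minimal sketch of the call (assuming the default Configuration resolves fs.defaultFS to a reachable cluster; the class name is ours for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsStatus;

public class FsStatusExample {
    public static void main(String[] args) throws Exception {
        // Connect using whatever fs.defaultFS the default Configuration resolves to.
        FileSystem fs = FileSystem.get(new Configuration());

        // One call to the filesystem; all three getters are plain accessors on the result.
        FsStatus status = fs.getStatus();
        System.out.println("Capacity (bytes):  " + status.getCapacity());
        System.out.println("Used (bytes):      " + status.getUsed());
        System.out.println("Remaining (bytes): " + status.getRemaining());

        fs.close();
    }
}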
From source file:com.awcoleman.StandaloneJava.AvroCombinerByBlock.java
License:Apache License
public AvroCombinerByBlock(String inDirStr, String outDirStr, String handleExisting) throws IOException {
    // Handle both an output directory and an output filename (ending with .avro)
    String outputFilename = DEFAULTOUTPUTFILENAME;
    if (outDirStr.endsWith(".avro")) {
        isOutputNameSpecifiedAndAFile = true;
        //String[] outputParts = outDirStr.split(":?\\\\");
        String[] outputParts = outDirStr.split("/");
        outputFilename = outputParts[outputParts.length - 1];
        // Remove outputFilename from outDirStr to get new outDirStr which is just the directory (and trailing /)
        outDirStr = outDirStr.replaceAll(Pattern.quote(outputFilename), "");
        outDirStr = outDirStr.substring(0, outDirStr.length() - (outDirStr.endsWith("/") ? 1 : 0));
    }

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input and output dirs exist
    Path inDir = new Path(inDirStr);
    Path outDir = new Path(outDirStr);
    if (!(hdfs.exists(inDir) && hdfs.isDirectory(inDir))) {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }
    if (!(hdfs.exists(outDir) && hdfs.isDirectory(outDir))) {
        if (hdfs.exists(outDir)) {
            // outDir exists and is a symlink or file, must die
            System.out.println("Requested output directory name ( " + outDirStr
                    + " ) exists but is not a directory. Exiting.");
            System.exit(1);
        } else {
            hdfs.mkdirs(outDir);
        }
    }

    RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inDir, true);
    while (fileStatusListIterator.hasNext()) {
        LocatedFileStatus fileStatus = fileStatusListIterator.next();
        if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
            inputFileList.add((FileStatus) fileStatus);
        }
    }

    // If an output file is specified assume we just want a rename.
    if (inputFileList.size() <= 1 && !isOutputNameSpecifiedAndAFile) {
        System.out.println("Only one or zero files found in input directory ( " + inDirStr + " ). Exiting.");
        System.exit(1);
    }

    // Get Schema and Compression Codec from seed file since we need it for the writer
    Path firstFile = inputFileList.get(0).getPath();
    FsInput fsin = new FsInput(firstFile, conf);
    DataFileReader<Object> dfrFirstFile = new DataFileReader<Object>(fsin, new GenericDatumReader<Object>());
    Schema fileSchema = dfrFirstFile.getSchema();
    String compCodecName = dfrFirstFile.getMetaString("avro.codec");
    // compCodecName should be null, deflate, snappy, or bzip2
    if (compCodecName == null) {
        compCodecName = "deflate"; // set to deflate even though original is no compression
    }
    dfrFirstFile.close();

    // Create empty HDFS file in output dir
    String seedFileStr = outDirStr + "/" + outputFilename;
    Path seedFile = new Path(seedFileStr);
    FSDataOutputStream hdfsdos = null;
    try {
        hdfsdos = hdfs.create(seedFile, false);
    } catch (org.apache.hadoop.fs.FileAlreadyExistsException faee) {
        if (handleExisting.equals("overwrite")) {
            hdfs.delete(seedFile, false);
            hdfsdos = hdfs.create(seedFile, false);
        } else if (handleExisting.equals("append")) {
            hdfsdos = hdfs.append(seedFile);
        } else {
            System.out.println("File " + seedFileStr + " exists and will not overwrite. handleExisting is set to "
                    + handleExisting + ". Exiting.");
            System.exit(1);
        }
    }
    if (hdfsdos == null) {
        System.out.println("Unable to create or write to output file ( " + seedFileStr
                + " ). handleExisting is set to " + handleExisting + ". Exiting.");
        System.exit(1);
    }

    // Append other files
    GenericDatumWriter<Object> gdw = new GenericDatumWriter<Object>(fileSchema);
    DataFileWriter<Object> dfwBase = new DataFileWriter<Object>(gdw);
    // Set compression to that found in the first file
    dfwBase.setCodec(CodecFactory.fromString(compCodecName));
    DataFileWriter<Object> dfw = dfwBase.create(fileSchema, hdfsdos);

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        FsInput fsin1 = new FsInput(thisFileStatus.getPath(), conf);
        DataFileReader<Object> dfr = new DataFileReader<Object>(fsin1, new GenericDatumReader<Object>());
        dfw.appendAllFrom(dfr, false);
        dfr.close();
    }

    dfw.close();
    dfwBase.close();
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByBlock.java
License:Apache License
public AvroCounterByBlock(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dir/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        DataFileStream<Object> dfs = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        dfs = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;
        while (dfs.hasNext()) {
            numAvroRecords = numAvroRecords + dfs.getBlockCount();
            thisFileRecords = thisFileRecords + dfs.getBlockCount();
            //System.out.println("Input file "+thisFileStatus.getPath()+" getBlockCount() is "+dfs.getBlockCount()+"." );
            dfs.nextBlock();
        }
        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");
        dfs.close();
        inStream.close();

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByRecord.java
License:Apache License
public AvroCounterByRecord(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dir/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        DataFileStream<Object> avroStream = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        avroStream = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;
        while (avroStream.hasNext()) {
            numAvroRecords++;
            thisFileRecords++;
            avroStream.next();
        }
        avroStream.close();
        inStream.close();
        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License:Open Source License
/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed in order to
 * leverage the partitioning information in the plan tree.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param partitionedMatrices map of data partition formats
 * @param vars local variable map
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PDataPartitionFormat> partitionedMatrices, LocalVariableMap vars)
        throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;

    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
            .getMappedProg(n.getID())[1];

    if (n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.toString())
            && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;

        // account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);

        // account for internal max constraint (note hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_EXPORT);

        // account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; // sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) // non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));

            // ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }

    // modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
            + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}
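To make the clamping sequence concrete, here is an illustrative walk-through (the numbers are hypothetical, not from the source): suppose Math.min(_N, _rnk) yields 20 and the internal export cap is 10, so replication drops to 10. If fs.getStatus().getRemaining() then reports 600 GB of free HDFS capacity against 40 GB of partitioned inputs, floor(0.9 * 600 / 40) = 13 does not bind and replication stays at 10. Were only 80 GB remaining, floor(0.9 * 80 / 40) = 1 would pull replication all the way down before the final Math.max restores at least the default write replication factor.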
From source file:fuse4j.hadoopfs.HdfsClientImpl.java
License:Apache License
@Override
public FuseStatfs getStatus(int uid) {
    FileSystem dfs = null;
    try {
        dfs = getDfs(uid);
        FsStatus status = dfs.getStatus();
        long cap = status.getCapacity();
        long bsize = dfs.getDefaultBlockSize();
        long used = status.getUsed();

        // Translate HDFS byte counts into FUSE statfs block counts.
        FuseStatfs statFS = new FuseStatfs();
        statFS.blockSize = (int) bsize;
        statFS.blocks = (int) (cap / bsize);
        statFS.blocksFree = (int) ((cap - used) / bsize);
        statFS.blocksAvail = (int) ((cap - used) / bsize);
        statFS.files = 1000;    // hard-coded placeholders; FsStatus exposes no
        statFS.filesFree = 500; // inode-style file counts
        statFS.namelen = 1023;
        return statFS;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
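One caveat with this conversion: FsStatus reports capacity, used, and remaining space as 64-bit byte counts, while the statfs fields here are populated through (int) casts. On a sufficiently large cluster the block counts can exceed Integer.MAX_VALUE and silently overflow, so the sizes this binding reports are only reliable for modestly sized filesystems.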
From source file:org.apache.falcon.service.SharedLibraryHostingService.java
License:Apache License
private FileSystem getFilesystem(final Cluster cluster) throws FalconException {
    FileSystem fs;
    try {
        LOG.info("Initializing FS: {} for cluster: {}", ClusterHelper.getStorageUrl(cluster), cluster.getName());
        fs = HadoopClientFactory.get().createFalconFileSystem(ClusterHelper.getConfiguration(cluster));
        fs.getStatus(); // force a round-trip to the filesystem to verify it is reachable
        return fs;
    } catch (Exception e) {
        throw new FalconException("Failed to initialize FS for cluster : " + cluster.getName(), e);
    }
}
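Here getStatus() serves purely as a connectivity probe: the returned FsStatus is discarded, and an unreachable cluster surfaces as the caught exception, wrapped into a FalconException. Since FileSystem.get-style factory calls can succeed without contacting the cluster, this single status call is a cheap way to fail fast at initialization rather than on first use.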
From source file:org.apache.sysml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License:Apache License
/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed in order to
 * leverage the partitioning information in the plan tree.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param partitionedMatrices map of data partition formats
 * @param vars local variable map
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PartitionFormat> partitionedMatrices, LocalVariableMap vars)
        throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;

    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
            .getMappedProg(n.getID())[1];

    if (((n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.name()))
        || (n.getExecType() == ExecType.SPARK
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_SPARK.name())))
        && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;

        // account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);

        // account for internal max constraint (note hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_PARTITIONING);

        // account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; // sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) // non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));

            // ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }

    // modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
            + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}