List of usage examples for org.apache.hadoop.fs FileSystem getStatus
public FsStatus getStatus() throws IOException
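getStatus() returns an FsStatus describing the capacity, used, and remaining space of the filesystem, in bytes. Before the larger examples below, here is a minimal sketch of the call (assuming the default Configuration resolves fs.defaultFS to a reachable cluster; the class name is ours for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FsStatus;

public class FsStatusExample {
    public static void main(String[] args) throws Exception {
        // Connect using whatever fs.defaultFS the default Configuration resolves to.
        FileSystem fs = FileSystem.get(new Configuration());

        // One call to the filesystem; all three getters are plain accessors on the result.
        FsStatus status = fs.getStatus();
        System.out.println("Capacity (bytes):  " + status.getCapacity());
        System.out.println("Used (bytes):      " + status.getUsed());
        System.out.println("Remaining (bytes): " + status.getRemaining());

        fs.close();
    }
}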
From source file:com.awcoleman.StandaloneJava.AvroCombinerByBlock.java
License:Apache License
public AvroCombinerByBlock(String inDirStr, String outDirStr, String handleExisting) throws IOException {
    // Handle both an output directory and an output filename (ending with .avro)
    String outputFilename = DEFAULTOUTPUTFILENAME;
    if (outDirStr.endsWith(".avro")) {
        isOutputNameSpecifiedAndAFile = true;
        //String[] outputParts = outDirStr.split(":?\\\\");
        String[] outputParts = outDirStr.split("/");
        outputFilename = outputParts[outputParts.length - 1];
        // Remove outputFilename from outDirStr to get new outDirStr which is just the directory (and trailing /)
        outDirStr = outDirStr.replaceAll(Pattern.quote(outputFilename), "");
        outDirStr = outDirStr.substring(0, outDirStr.length() - (outDirStr.endsWith("/") ? 1 : 0));
    }

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input and output dirs exist
    Path inDir = new Path(inDirStr);
    Path outDir = new Path(outDirStr);
    if (!(hdfs.exists(inDir) && hdfs.isDirectory(inDir))) {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }
    if (!(hdfs.exists(outDir) && hdfs.isDirectory(outDir))) {
        if (hdfs.exists(outDir)) {
            // outDir exists and is a symlink or file, must die
            System.out.println("Requested output directory name ( " + outDirStr
                    + " ) exists but is not a directory. Exiting.");
            System.exit(1);
        } else {
            hdfs.mkdirs(outDir);
        }
    }

    RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inDir, true);
    while (fileStatusListIterator.hasNext()) {
        LocatedFileStatus fileStatus = fileStatusListIterator.next();
        if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
            inputFileList.add((FileStatus) fileStatus);
        }
    }

    // If an output file is specified assume we just want a rename.
    if (inputFileList.size() <= 1 && !isOutputNameSpecifiedAndAFile) {
        System.out.println("Only one or zero files found in input directory ( " + inDirStr + " ). Exiting.");
        System.exit(1);
    }

    // Get Schema and Compression Codec from seed file since we need it for the writer
    Path firstFile = inputFileList.get(0).getPath();
    FsInput fsin = new FsInput(firstFile, conf);
    DataFileReader<Object> dfrFirstFile = new DataFileReader<Object>(fsin, new GenericDatumReader<Object>());
    Schema fileSchema = dfrFirstFile.getSchema();
    String compCodecName = dfrFirstFile.getMetaString("avro.codec");
    // compCodecName should be null, deflate, snappy, or bzip2
    if (compCodecName == null) {
        compCodecName = "deflate"; // set to deflate even though original is no compression
    }
    dfrFirstFile.close();

    // Create empty HDFS file in output dir
    String seedFileStr = outDirStr + "/" + outputFilename;
    Path seedFile = new Path(seedFileStr);
    FSDataOutputStream hdfsdos = null;
    try {
        hdfsdos = hdfs.create(seedFile, false);
    } catch (org.apache.hadoop.fs.FileAlreadyExistsException faee) {
        if (handleExisting.equals("overwrite")) {
            hdfs.delete(seedFile, false);
            hdfsdos = hdfs.create(seedFile, false);
        } else if (handleExisting.equals("append")) {
            hdfsdos = hdfs.append(seedFile);
        } else {
            System.out.println("File " + seedFileStr + " exists and will not overwrite. handleExisting is set to "
                    + handleExisting + ". Exiting.");
            System.exit(1);
        }
    }
    if (hdfsdos == null) {
        System.out.println("Unable to create or write to output file ( " + seedFileStr
                + " ). handleExisting is set to " + handleExisting + ". Exiting.");
        System.exit(1);
    }

    // Append other files
    GenericDatumWriter<Object> gdw = new GenericDatumWriter<Object>(fileSchema);
    DataFileWriter<Object> dfwBase = new DataFileWriter<Object>(gdw);
    // Set compression to that found in the first file
    dfwBase.setCodec(CodecFactory.fromString(compCodecName));
    DataFileWriter<Object> dfw = dfwBase.create(fileSchema, hdfsdos);

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        FsInput fsin1 = new FsInput(thisFileStatus.getPath(), conf);
        DataFileReader<Object> dfr = new DataFileReader<Object>(fsin1, new GenericDatumReader<Object>());
        dfw.appendAllFrom(dfr, false);
        dfr.close();
    }

    dfw.close();
    dfwBase.close();
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByBlock.java
License:Apache License
public AvroCounterByBlock(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dir/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        DataFileStream<Object> dfs = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        dfs = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;
        while (dfs.hasNext()) {
            numAvroRecords = numAvroRecords + dfs.getBlockCount();
            thisFileRecords = thisFileRecords + dfs.getBlockCount();
            //System.out.println("Input file "+thisFileStatus.getPath()+" getBlockCount() is "+dfs.getBlockCount()+"." );
            dfs.nextBlock();
        }
        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");
        dfs.close();
        inStream.close();

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
From source file:com.awcoleman.StandaloneJava.AvroCounterByRecord.java
License:Apache License
public AvroCounterByRecord(String inDirStr) throws IOException {
    long numAvroRecords = 0;

    // Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); // see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    // Check if input dir/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { // single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { // dir
        // Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();
            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {
        // _SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }
        DataFileStream<Object> avroStream = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        avroStream = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;
        while (avroStream.hasNext()) {
            numAvroRecords++;
            thisFileRecords++;
            avroStream.next();
        }
        avroStream.close();
        inStream.close();
        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");
}
From source file:com.ibm.bi.dml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License:Open Source License
/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed in order to
 * leverage the partitioning information in the plan tree.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param partitionedMatrices map of data partition formats
 * @param vars local variable map
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PDataPartitionFormat> partitionedMatrices, LocalVariableMap vars)
        throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;

    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
            .getMappedProg(n.getID())[1];

    if (n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.toString())
            && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;

        // account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);

        // account for internal max constraint (note hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_EXPORT);

        // account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; // sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) // non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));

            // ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }

    // modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
            + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}
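To make the clamping sequence concrete, here is an illustrative walk-through (the numbers are hypothetical, not from the source): suppose Math.min(_N, _rnk) yields 20 and the internal export cap is 10, so replication drops to 10. If fs.getStatus().getRemaining() then reports 600 GB of free HDFS capacity against 40 GB of partitioned inputs, floor(0.9 * 600 / 40) = 13 does not bind and replication stays at 10. Were only 80 GB remaining, floor(0.9 * 80 / 40) = 1 would pull replication all the way down before the final Math.max restores at least the default write replication factor.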
From source file:fuse4j.hadoopfs.HdfsClientImpl.java
License:Apache License
@Override
public FuseStatfs getStatus(int uid) {
    FileSystem dfs = null;
    try {
        dfs = getDfs(uid);
        FsStatus status = dfs.getStatus();
        long cap = status.getCapacity();
        long bsize = dfs.getDefaultBlockSize();
        long used = status.getUsed();

        // Translate HDFS byte counts into FUSE statfs block counts.
        FuseStatfs statFS = new FuseStatfs();
        statFS.blockSize = (int) bsize;
        statFS.blocks = (int) (cap / bsize);
        statFS.blocksFree = (int) ((cap - used) / bsize);
        statFS.blocksAvail = (int) ((cap - used) / bsize);
        statFS.files = 1000;    // hard-coded placeholders; FsStatus exposes no
        statFS.filesFree = 500; // inode-style file counts
        statFS.namelen = 1023;
        return statFS;
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
}
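One caveat with this conversion: FsStatus reports capacity, used, and remaining space as 64-bit byte counts, while the statfs fields here are populated through (int) casts. On a sufficiently large cluster the block counts can exceed Integer.MAX_VALUE and silently overflow, so the sizes this binding reports are only reliable for modestly sized filesystems.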
From source file:org.apache.falcon.service.SharedLibraryHostingService.java
License:Apache License
private FileSystem getFilesystem(final Cluster cluster) throws FalconException {
    FileSystem fs;
    try {
        LOG.info("Initializing FS: {} for cluster: {}", ClusterHelper.getStorageUrl(cluster), cluster.getName());
        fs = HadoopClientFactory.get().createFalconFileSystem(ClusterHelper.getConfiguration(cluster));
        fs.getStatus(); // force a round-trip to the filesystem to verify it is reachable
        return fs;
    } catch (Exception e) {
        throw new FalconException("Failed to initialize FS for cluster : " + cluster.getName(), e);
    }
}
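Here getStatus() serves purely as a connectivity probe: the returned FsStatus is discarded, and an unreachable cluster surfaces as the caught exception, wrapped into a FalconException. Since FileSystem.get-style factory calls can succeed without contacting the cluster, this single status call is a cheap way to fail fast at initialization rather than on first use.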
From source file:org.apache.sysml.runtime.controlprogram.parfor.opt.OptimizerRuleBased.java
License:Apache License
/**
 * Increasing the partition replication factor is beneficial if partitions are
 * read multiple times (e.g., in nested loops) because partitioning (done once)
 * gets slightly slower but there is a higher probability for local access.
 *
 * NOTE: this rewrite requires 'set data partitioner' to be executed in order to
 * leverage the partitioning information in the plan tree.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param partitionedMatrices map of data partition formats
 * @param vars local variable map
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 */
protected void rewriteSetPartitionReplicationFactor(OptNode n,
        HashMap<String, PartitionFormat> partitionedMatrices, LocalVariableMap vars)
        throws DMLRuntimeException {
    boolean apply = false;
    double sizeReplicated = 0;
    int replication = ParForProgramBlock.WRITE_REPLICATION_FACTOR;

    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping()
            .getMappedProg(n.getID())[1];

    if (((n.getExecType() == ExecType.MR
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_MR.name()))
        || (n.getExecType() == ExecType.SPARK
            && n.getParam(ParamType.DATA_PARTITIONER).equals(PDataPartitioner.REMOTE_SPARK.name())))
        && n.hasNestedParallelism(false) && n.hasNestedPartitionReads(false)) {
        apply = true;

        // account for problem and cluster constraints
        replication = (int) Math.min(_N, _rnk);

        // account for internal max constraint (note hadoop will warn if max > 10)
        replication = (int) Math.min(replication, MAX_REPLICATION_FACTOR_PARTITIONING);

        // account for remaining hdfs capacity
        try {
            FileSystem fs = FileSystem.get(ConfigurationManager.getCachedJobConf());
            long hdfsCapacityRemain = fs.getStatus().getRemaining();
            long sizeInputs = 0; // sum of all input sizes (w/o replication)
            for (String var : partitionedMatrices.keySet()) {
                MatrixObject mo = (MatrixObject) vars.get(var);
                Path fname = new Path(mo.getFileName());
                if (fs.exists(fname)) // non-existing (e.g., CP) -> small file
                    sizeInputs += fs.getContentSummary(fname).getLength();
            }
            replication = (int) Math.min(replication, Math.floor(0.9 * hdfsCapacityRemain / sizeInputs));

            // ensure at least replication 1
            replication = Math.max(replication, ParForProgramBlock.WRITE_REPLICATION_FACTOR);
            sizeReplicated = replication * sizeInputs;
        } catch (Exception ex) {
            throw new DMLRuntimeException("Failed to analyze remaining hdfs capacity.", ex);
        }
    }

    // modify the runtime plan
    if (apply)
        pfpb.setPartitionReplicationFactor(replication);
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set partition replication factor' - result=" + apply
            + ((apply) ? " (" + replication + ", " + toMB(sizeReplicated) + ")" : ""));
}