List of usage examples for org.apache.hadoop.fs FileSystem getContentSummary
public ContentSummary getContentSummary(Path f) throws IOException
From source file:org.apache.tajo.storage.AbstractStorageManager.java
License:Apache License
public long calculateSize(Path tablePath) throws IOException { FileSystem fs = tablePath.getFileSystem(conf); long totalSize = 0; if (fs.exists(tablePath)) { totalSize = fs.getContentSummary(tablePath).getLength(); }// w w w . j av a 2 s . co m return totalSize; }
From source file:org.apache.tajo.storage.FileTablespace.java
License:Apache License
/** * Finalizes result data. Tajo stores result data in the staging directory. * If the query fails, clean up the staging directory. * Otherwise the query is successful, move to the final directory from the staging directory. * * @param queryContext The query property * @param changeFileSeq If true change result file name with max sequence. * @return Saved path//from w w w . j a v a 2 s. c o m * @throws java.io.IOException */ protected Path commitOutputData(OverridableConf queryContext, boolean changeFileSeq) throws IOException { Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR)); Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME); Path finalOutputDir; if (!queryContext.get(QueryVars.OUTPUT_TABLE_URI, "").isEmpty()) { finalOutputDir = new Path(queryContext.get(QueryVars.OUTPUT_TABLE_URI)); try { FileSystem fs = stagingResultDir.getFileSystem(conf); if (queryContext.getBool(QueryVars.OUTPUT_OVERWRITE, false)) { // INSERT OVERWRITE INTO // It moves the original table into the temporary location. // Then it moves the new result table into the original table location. // Upon failed, it recovers the original table if possible. boolean movedToOldTable = false; boolean committed = false; Path oldTableDir = new Path(stagingDir, TajoConstants.INSERT_OVERWIRTE_OLD_TABLE_NAME); ContentSummary summary = fs.getContentSummary(stagingResultDir); // When inserting empty data into a partitioned table, check if keep existing data need to be remove or not. boolean overwriteEnabled = queryContext .getBool(SessionVars.PARTITION_NO_RESULT_OVERWRITE_ENABLED); // If existing data doesn't need to keep, check if there are some files. if ((!queryContext.get(QueryVars.OUTPUT_PARTITIONS, "").isEmpty()) && (!overwriteEnabled || (overwriteEnabled && summary.getFileCount() > 0L))) { // This is a map for existing non-leaf directory to rename. A key is current directory and a value is // renaming directory. 
Map<Path, Path> renameDirs = TUtil.newHashMap(); // This is a map for recovering existing partition directory. A key is current directory and a value is // temporary directory to back up. Map<Path, Path> recoveryDirs = TUtil.newHashMap(); try { if (!fs.exists(finalOutputDir)) { fs.mkdirs(finalOutputDir); } visitPartitionedDirectory(fs, stagingResultDir, finalOutputDir, stagingResultDir.toString(), renameDirs, oldTableDir); // Rename target partition directories for (Map.Entry<Path, Path> entry : renameDirs.entrySet()) { // Backup existing data files for recovering if (fs.exists(entry.getValue())) { String recoveryPathString = entry.getValue().toString() .replaceAll(finalOutputDir.toString(), oldTableDir.toString()); Path recoveryPath = new Path(recoveryPathString); fs.rename(entry.getValue(), recoveryPath); fs.exists(recoveryPath); recoveryDirs.put(entry.getValue(), recoveryPath); } // Delete existing directory fs.delete(entry.getValue(), true); // Rename staging directory to final output directory fs.rename(entry.getKey(), entry.getValue()); } } catch (IOException ioe) { // Remove created dirs for (Map.Entry<Path, Path> entry : renameDirs.entrySet()) { fs.delete(entry.getValue(), true); } // Recovery renamed dirs for (Map.Entry<Path, Path> entry : recoveryDirs.entrySet()) { fs.delete(entry.getValue(), true); fs.rename(entry.getValue(), entry.getKey()); } throw new IOException(ioe.getMessage()); } } else { // no partition try { // if the final output dir exists, move all contents to the temporary table dir. // Otherwise, just make the final output dir. As a result, the final output dir will be empty. if (fs.exists(finalOutputDir)) { fs.mkdirs(oldTableDir); for (FileStatus status : fs.listStatus(finalOutputDir, hiddenFileFilter)) { fs.rename(status.getPath(), oldTableDir); } movedToOldTable = fs.exists(oldTableDir); } else { // if the parent does not exist, make its parent directory. fs.mkdirs(finalOutputDir); } // Move the results to the final output dir. 
for (FileStatus status : fs.listStatus(stagingResultDir)) { fs.rename(status.getPath(), finalOutputDir); } // Check the final output dir committed = fs.exists(finalOutputDir); } catch (IOException ioe) { // recover the old table if (movedToOldTable && !committed) { // if commit is failed, recover the old data for (FileStatus status : fs.listStatus(finalOutputDir, hiddenFileFilter)) { fs.delete(status.getPath(), true); } for (FileStatus status : fs.listStatus(oldTableDir)) { fs.rename(status.getPath(), finalOutputDir); } } throw new IOException(ioe.getMessage()); } } } else { String queryType = queryContext.get(QueryVars.COMMAND_TYPE); if (queryType != null && queryType.equals(NodeType.INSERT.name())) { // INSERT INTO an existing table NumberFormat fmt = NumberFormat.getInstance(); fmt.setGroupingUsed(false); fmt.setMinimumIntegerDigits(3); if (!queryContext.get(QueryVars.OUTPUT_PARTITIONS, "").isEmpty()) { for (FileStatus eachFile : fs.listStatus(stagingResultDir)) { if (eachFile.isFile()) { LOG.warn("Partition table can't have file in a staging dir: " + eachFile.getPath()); continue; } moveResultFromStageToFinal(fs, stagingResultDir, eachFile, finalOutputDir, fmt, -1, changeFileSeq); } } else { int maxSeq = StorageUtil.getMaxFileSequence(fs, finalOutputDir, false) + 1; for (FileStatus eachFile : fs.listStatus(stagingResultDir)) { if (eachFile.getPath().getName().startsWith("_")) { continue; } moveResultFromStageToFinal(fs, stagingResultDir, eachFile, finalOutputDir, fmt, maxSeq++, changeFileSeq); } } // checking all file moved and remove empty dir verifyAllFileMoved(fs, stagingResultDir); FileStatus[] files = fs.listStatus(stagingResultDir); if (files != null && files.length != 0) { for (FileStatus eachFile : files) { LOG.error("There are some unmoved files in staging dir:" + eachFile.getPath()); } } } else { // CREATE TABLE AS SELECT (CTAS) if (fs.exists(finalOutputDir)) { for (FileStatus status : fs.listStatus(stagingResultDir)) { fs.rename(status.getPath(), 
finalOutputDir); } } else { fs.rename(stagingResultDir, finalOutputDir); } LOG.info("Moved from the staging dir to the output directory '" + finalOutputDir); } } // remove the staging directory if the final output dir is given. Path stagingDirRoot = stagingDir.getParent(); fs.delete(stagingDirRoot, true); } catch (Throwable t) { LOG.error(t); throw new IOException(t); } } else { finalOutputDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME); } return finalOutputDir; }
From source file:org.apache.tajo.storage.StorageManager.java
License:Apache License
/** * Finalizes result data. Tajo stores result data in the staging directory. * If the query fails, clean up the staging directory. * Otherwise the query is successful, move to the final directory from the staging directory. * * @param queryContext The query property * @param finalEbId The final execution block id * @param plan The query plan/*ww w. j a v a2 s . c om*/ * @param schema The final output schema * @param tableDesc The description of the target table * @param changeFileSeq If true change result file name with max sequence. * @return Saved path * @throws java.io.IOException */ protected Path commitOutputData(OverridableConf queryContext, ExecutionBlockId finalEbId, LogicalPlan plan, Schema schema, TableDesc tableDesc, boolean changeFileSeq) throws IOException { Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR)); Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME); Path finalOutputDir; if (!queryContext.get(QueryVars.OUTPUT_TABLE_PATH, "").isEmpty()) { finalOutputDir = new Path(queryContext.get(QueryVars.OUTPUT_TABLE_PATH)); try { FileSystem fs = stagingResultDir.getFileSystem(conf); if (queryContext.getBool(QueryVars.OUTPUT_OVERWRITE, false)) { // INSERT OVERWRITE INTO // It moves the original table into the temporary location. // Then it moves the new result table into the original table location. // Upon failed, it recovers the original table if possible. boolean movedToOldTable = false; boolean committed = false; Path oldTableDir = new Path(stagingDir, TajoConstants.INSERT_OVERWIRTE_OLD_TABLE_NAME); ContentSummary summary = fs.getContentSummary(stagingResultDir); if (!queryContext.get(QueryVars.OUTPUT_PARTITIONS, "").isEmpty() && summary.getFileCount() > 0L) { // This is a map for existing non-leaf directory to rename. A key is current directory and a value is // renaming directory. Map<Path, Path> renameDirs = TUtil.newHashMap(); // This is a map for recovering existing partition directory. 
A key is current directory and a value is // temporary directory to back up. Map<Path, Path> recoveryDirs = TUtil.newHashMap(); try { if (!fs.exists(finalOutputDir)) { fs.mkdirs(finalOutputDir); } visitPartitionedDirectory(fs, stagingResultDir, finalOutputDir, stagingResultDir.toString(), renameDirs, oldTableDir); // Rename target partition directories for (Map.Entry<Path, Path> entry : renameDirs.entrySet()) { // Backup existing data files for recovering if (fs.exists(entry.getValue())) { String recoveryPathString = entry.getValue().toString() .replaceAll(finalOutputDir.toString(), oldTableDir.toString()); Path recoveryPath = new Path(recoveryPathString); fs.rename(entry.getValue(), recoveryPath); fs.exists(recoveryPath); recoveryDirs.put(entry.getValue(), recoveryPath); } // Delete existing directory fs.delete(entry.getValue(), true); // Rename staging directory to final output directory fs.rename(entry.getKey(), entry.getValue()); } } catch (IOException ioe) { // Remove created dirs for (Map.Entry<Path, Path> entry : renameDirs.entrySet()) { fs.delete(entry.getValue(), true); } // Recovery renamed dirs for (Map.Entry<Path, Path> entry : recoveryDirs.entrySet()) { fs.delete(entry.getValue(), true); fs.rename(entry.getValue(), entry.getKey()); } throw new IOException(ioe.getMessage()); } } else { // no partition try { // if the final output dir exists, move all contents to the temporary table dir. // Otherwise, just make the final output dir. As a result, the final output dir will be empty. if (fs.exists(finalOutputDir)) { fs.mkdirs(oldTableDir); for (FileStatus status : fs.listStatus(finalOutputDir, StorageManager.hiddenFileFilter)) { fs.rename(status.getPath(), oldTableDir); } movedToOldTable = fs.exists(oldTableDir); } else { // if the parent does not exist, make its parent directory. fs.mkdirs(finalOutputDir); } // Move the results to the final output dir. 
for (FileStatus status : fs.listStatus(stagingResultDir)) { fs.rename(status.getPath(), finalOutputDir); } // Check the final output dir committed = fs.exists(finalOutputDir); } catch (IOException ioe) { // recover the old table if (movedToOldTable && !committed) { // if commit is failed, recover the old data for (FileStatus status : fs.listStatus(finalOutputDir, StorageManager.hiddenFileFilter)) { fs.delete(status.getPath(), true); } for (FileStatus status : fs.listStatus(oldTableDir)) { fs.rename(status.getPath(), finalOutputDir); } } throw new IOException(ioe.getMessage()); } } } else { String queryType = queryContext.get(QueryVars.COMMAND_TYPE); if (queryType != null && queryType.equals(NodeType.INSERT.name())) { // INSERT INTO an existing table NumberFormat fmt = NumberFormat.getInstance(); fmt.setGroupingUsed(false); fmt.setMinimumIntegerDigits(3); if (!queryContext.get(QueryVars.OUTPUT_PARTITIONS, "").isEmpty()) { for (FileStatus eachFile : fs.listStatus(stagingResultDir)) { if (eachFile.isFile()) { LOG.warn("Partition table can't have file in a staging dir: " + eachFile.getPath()); continue; } moveResultFromStageToFinal(fs, stagingResultDir, eachFile, finalOutputDir, fmt, -1, changeFileSeq); } } else { int maxSeq = StorageUtil.getMaxFileSequence(fs, finalOutputDir, false) + 1; for (FileStatus eachFile : fs.listStatus(stagingResultDir)) { if (eachFile.getPath().getName().startsWith("_")) { continue; } moveResultFromStageToFinal(fs, stagingResultDir, eachFile, finalOutputDir, fmt, maxSeq++, changeFileSeq); } } // checking all file moved and remove empty dir verifyAllFileMoved(fs, stagingResultDir); FileStatus[] files = fs.listStatus(stagingResultDir); if (files != null && files.length != 0) { for (FileStatus eachFile : files) { LOG.error("There are some unmoved files in staging dir:" + eachFile.getPath()); } } } else { // CREATE TABLE AS SELECT (CTAS) if (fs.exists(finalOutputDir)) { for (FileStatus status : fs.listStatus(stagingResultDir)) { 
fs.rename(status.getPath(), finalOutputDir); } } else { fs.rename(stagingResultDir, finalOutputDir); } LOG.info("Moved from the staging dir to the output directory '" + finalOutputDir); } } // remove the staging directory if the final output dir is given. Path stagingDirRoot = stagingDir.getParent(); fs.delete(stagingDirRoot, true); } catch (Throwable t) { LOG.error(t); throw new IOException(t); } } else { finalOutputDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME); } return finalOutputDir; }
From source file:org.bgi.flexlab.gaeatools.sortvcf.SortVcf.java
License:Open Source License
public int run(String[] args) throws Exception { final Configuration conf = getConf(); SortVcfOptions options = new SortVcfOptions(args); conf.set(VCFOutputFormat.OUTPUT_VCF_FORMAT_PROPERTY, options.getOutputFormat()); conf.setBoolean("hadoopbam.vcf.write-header", false); Path inputPath = new Path(options.getInput()); FileSystem fs = inputPath.getFileSystem(conf); FileStatus[] files = fs.listStatus(inputPath); Path vcfHeaderPath = files[0].getPath(); if (options.getVcfHeader() != null) vcfHeaderPath = new Path(options.getVcfHeader()); if (files.length <= 0) { System.err.println("Input dir is empty!"); return 1; }//from w ww .j a v a 2s. co m conf.set(MyVCFOutputFormat.INPUT_PATH_PROP, vcfHeaderPath.toString()); conf.set("io.compression.codecs", BGZFCodec.class.getCanonicalName()); KeyIgnoringVCFOutputFormat<Text> baseOF = new KeyIgnoringVCFOutputFormat<>(conf); baseOF.readHeaderFrom(vcfHeaderPath, vcfHeaderPath.getFileSystem(conf)); VCFHeader vcfHeader = baseOF.getHeader(); Job job = Job.getInstance(conf, "VCFSort"); job.setJarByClass(SortVcf.class); job.setMapperClass(Mapper.class); job.setReducerClass(SortVcfReducer.class); job.setMapOutputKeyClass(LongWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(VariantContextWritable.class); job.setInputFormatClass(VCFInputFormat.class); job.setOutputFormatClass(MyVCFOutputFormat.class); job.setPartitionerClass(TotalOrderPartitioner.class); job.setNumReduceTasks(options.getReducerNum()); SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmmss"); String tmpDir = "/user/" + System.getProperty("user.name") + "/vcfsorttmp-" + df.format(new Date()); Path partTmp = new Path(tmpDir + "/temp"); VCFInputFormat.addInputPath(job, inputPath); if (MAX_SPLIT_SIZE < VCFInputFormat.getMaxSplitSize(job)) VCFInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE); FileOutputFormat.setOutputPath(job, partTmp); FileOutputFormat.setCompressOutput(job, true); FileOutputFormat.setOutputCompressorClass(job, 
BGZFCodec.class); Path partitionFile; if (options.getPartitionFileString() == null) { partitionFile = new Path(tmpDir + "/_partitons.lst"); TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile); System.out.println("vcf-sort :: Sampling..."); int numSamples = options.getNumSamples(); if (fs.getContentSummary(inputPath).getLength() < 10000000) { numSamples = 1; job.setNumReduceTasks(1); } InputSampler.writePartitionFile(job, new InputSampler.RandomSampler<LongWritable, VariantContextWritable>(0.001, numSamples, numSamples)); } else { System.out.println("vcf-sort :: use partitionFile:" + options.getPartitionFileString() + " ..."); partitionFile = new Path(options.getPartitionFileString()); TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile); } if (!job.waitForCompletion(true)) { System.err.println("sort :: Job failed."); return 1; } final FileSystem srcFS = partTmp.getFileSystem(conf); Path headerPath = new Path(tmpDir + "/header.vcf.gz"); BGZFCodec bgzfCodec = new BGZFCodec(); OutputStream os = bgzfCodec.createOutputStream(srcFS.create(headerPath)); VariantContextWriterBuilder builder = new VariantContextWriterBuilder(); VariantContextWriter writer; writer = builder.setOutputVCFStream(new FilterOutputStream(os) { @Override public void close() throws IOException { this.out.flush(); } }).setOptions(VariantContextWriterBuilder.NO_OPTIONS).build(); writer.writeHeader(vcfHeader); os.close(); Path outputPath = new Path(options.getOutput()); final FileSystem dstFS = outputPath.getFileSystem(conf); OutputStream vcfgz = dstFS.create(outputPath); final FSDataInputStream headerIns = srcFS.open(headerPath); IOUtils.copyBytes(headerIns, vcfgz, conf, false); headerIns.close(); final FileStatus[] parts = partTmp.getFileSystem(conf) .globStatus(new Path(partTmp.toString() + "/part-*-[0-9][0-9][0-9][0-9][0-9]*")); for (FileStatus p : parts) { final FSDataInputStream ins = srcFS.open(p.getPath()); IOUtils.copyBytes(ins, vcfgz, conf, 
false); ins.close(); } vcfgz.write(BlockCompressedStreamConstants.EMPTY_GZIP_BLOCK); vcfgz.close(); partTmp.getFileSystem(conf).delete(partTmp, true); return 0; }
From source file:org.lab41.HdfsUtil.java
License:Apache License
public static long getSizeOfDirectory(Path path, Configuration configuration) throws IOException { //Get the file size of the unannotated Edges FileSystem fileSystem = FileSystem.get(configuration); long size = fileSystem.getContentSummary(path).getLength(); return size;/*from w ww. ja va2 s. c o m*/ }
From source file:org.opencloudengine.garuda.backend.hdfs.HdfsServiceImpl.java
License:Open Source License
@Override public void downloadFile(String path, HttpServletResponse response) throws Exception { this.mustExists(path); FileSystem fs = fileSystemFactory.getFileSystem(); Path fsPath = new Path(path); FileStatus fileStatus = fs.getFileStatus(fsPath); if (!fileStatus.isFile()) { this.notFileException(fsPath.toString()); }//from ww w . j a v a 2 s. c om HdfsFileInfo fileInfo = new HdfsFileInfo(fileStatus, fs.getContentSummary(fsPath)); FSDataInputStream in = fs.open(fsPath); String filename = fileInfo.getFilename(); response.setHeader("Content-Length", "" + fileInfo.getLength()); response.setHeader("Content-Transfer-Encoding", "binary"); response.setHeader("Content-Type", "application/force-download"); response.setHeader("Content-Disposition", MessageFormatter .format("attachment; fullyQualifiedPath={}; filename={};", URLEncoder.encode(fileInfo.getFullyQualifiedPath(), "UTF-8"), filename) .getMessage()); response.setStatus(200); ServletOutputStream out = response.getOutputStream(); byte[] b = new byte[1024]; int numBytes = 0; while ((numBytes = in.read(b)) > 0) { out.write(b, 0, numBytes); } in.close(); out.close(); fs.close(); }
From source file:org.opencloudengine.garuda.backend.hdfs.HdfsServiceImpl.java
License:Open Source License
/**
 * Lists one page of the entries directly under an HDFS directory.
 *
 * <p>Entries are counted in iteration order starting at 1; only those whose running count lies
 * in {@code [start, end]} are returned. When {@code filter} is non-empty, only entries whose
 * name contains it are counted at all.
 *
 * @param path   HDFS directory to list; must exist and be a directory
 * @param start  1-based position of the first entry to return (inclusive)
 * @param end    1-based position of the last entry to return (inclusive)
 * @param filter substring an entry name must contain, or null/empty for no filtering
 * @return the requested page together with the total number of counted entries
 * @throws Exception if validation fails or the file system cannot be read
 */
@Override
public HdfsListInfo list(String path, int start, int end, final String filter) throws Exception {
    HdfsListInfo result = new HdfsListInfo();

    this.indexCheck(start, end);
    this.mustExists(path);

    FileSystem fs = fileSystemFactory.getFileSystem();
    Path directory = new Path(path);

    if (!fs.getFileStatus(directory).isDirectory()) {
        this.notDirectoryException(directory.toString());
    }

    final boolean filtering = !StringUtils.isEmpty(filter);
    List<HdfsFileInfo> page = new ArrayList<>();
    int matched = 0;

    RemoteIterator<LocatedFileStatus> entries = fs.listLocatedStatus(directory);
    while (entries.hasNext()) {
        LocatedFileStatus entry = entries.next();

        // Entries rejected by the filter are not counted.
        if (filtering && !entry.getPath().getName().contains(filter)) {
            continue;
        }

        matched++;
        if (matched < start || matched > end) {
            continue;
        }

        FileStatus status = fs.getFileStatus(entry.getPath());
        page.add(new HdfsFileInfo(status, fs.getContentSummary(status.getPath())));
    }

    result.setFileInfoList(page);
    result.setCount(matched);
    return result;
}
From source file:org.opencloudengine.garuda.backend.hdfs.HdfsServiceImpl.java
License:Open Source License
/**
 * Builds the file information for a single HDFS path from its file status and content summary.
 *
 * @param path HDFS path to inspect; must exist
 * @return file information combining the path's status and content summary
 * @throws Exception if the path does not exist or the file system cannot be queried
 */
@Override
public HdfsFileInfo getStatus(String path) throws Exception {
    this.mustExists(path);

    final FileSystem fs = fileSystemFactory.getFileSystem();
    final Path target = new Path(path);

    return new HdfsFileInfo(fs.getFileStatus(target), fs.getContentSummary(target));
}
From source file:org.opencloudengine.garuda.backend.hdfs.HdfsServiceImpl.java
License:Open Source License
/**
 * Renames the HDFS entry at {@code path} to {@code rename}, under the path reported by
 * {@code HdfsFileInfo.getPath()} for that entry.
 *
 * @param path   HDFS path of the entry to rename
 * @param rename new name; must not be null or empty
 * @return the new path of the entry
 * @throws Exception {@code ServiceException} if {@code rename} is empty or the file system
 *                   reports that the rename did not happen; otherwise any file system error
 */
private Path _rename(String path, String rename) throws Exception {
    FileSystem fs = fileSystemFactory.getFileSystem();
    try {
        Path fsPath = new Path(path);

        FileStatus fileStatus = fs.getFileStatus(fsPath);
        HdfsFileInfo hdfsFileInfo = new HdfsFileInfo(fileStatus, fs.getContentSummary(fsPath));
        // NOTE(review): getPath() is treated as the parent directory here -- confirm against HdfsFileInfo.
        String parentPath = hdfsFileInfo.getPath();

        String newPath = parentPath + "/" + rename;
        Path path1 = new Path(newPath);
        if (StringUtils.isEmpty(rename)) {
            logger.warn("Failed rename HDFS file, Rename is empty : {}", newPath);
            throw new ServiceException(" ? ?? .");
        }

        // FileSystem.rename signals many failures by returning false rather than throwing;
        // without this check a failed rename would be reported to the caller as success.
        if (!fs.rename(fsPath, path1)) {
            logger.warn("Failed rename HDFS file : {} -> {}", fsPath, path1);
            throw new ServiceException("Failed to rename HDFS path " + fsPath + " to " + path1);
        }
        return path1;
    } finally {
        // The original closed the file system only on success; close it on every path.
        fs.close();
    }
}
From source file:org.pentaho.di.job.entries.hadooptransjobexecutor.DistributedCacheUtilTest.java
License:Apache License
/** * Utility to attempt to stage a file to HDFS for use with Distributed Cache. * * @param ch Distributed Cache Helper * @param source File or directory to stage * @param fs FileSystem to stage to * @param root Root directory to clean up when this test is complete * @param dest Destination path to stage to * @param expectedFileCount Expected number of files to exist in the destination once staged * @param expectedDirCount Expected number of directories to exist in the destiation once staged * @throws Exception/* w w w . java 2 s . c o m*/ */ private void stageForCacheTester(DistributedCacheUtil ch, FileObject source, FileSystem fs, Path root, Path dest, int expectedFileCount, int expectedDirCount) throws Exception { try { ch.stageForCache(source, fs, dest, true); assertTrue(fs.exists(dest)); ContentSummary cs = fs.getContentSummary(dest); assertEquals(expectedFileCount, cs.getFileCount()); assertEquals(expectedDirCount, cs.getDirectoryCount()); assertEquals(FsPermission.createImmutable((short) 0755), fs.getFileStatus(dest).getPermission()); } finally { // Clean up after ourself if (!fs.delete(root, true)) { System.err.println("error deleting FileSystem temp dir " + root); } } }