List of usage examples for org.apache.hadoop.fs ContentSummary getFileCount
public long getFileCount()
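All of the entries below obtain a ContentSummary from FileSystem.getContentSummary(Path) and read its aggregate file count. For orientation, here is a minimal, self-contained sketch of that call path (the path below is a placeholder, not taken from any of the sources):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileCountSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        Path dir = new Path("/tmp/example"); // hypothetical path
        ContentSummary summary = fs.getContentSummary(dir);
        // getFileCount() returns the number of files under the path, counted recursively.
        System.out.println("fileCount=" + summary.getFileCount()
            + ", directoryCount=" + summary.getDirectoryCount()
            + ", length=" + summary.getLength());
    }
}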
From source file:com.bigstep.datalake.JsonUtil.java
License:Apache License
/** Convert a ContentSummary to a Json string. */
public static String toJsonString(final ContentSummary contentsummary) {
    if (contentsummary == null) {
        return null;
    }
    final Map<String, Object> m = new TreeMap<String, Object>();
    m.put("length", contentsummary.getLength());
    m.put("fileCount", contentsummary.getFileCount());
    m.put("directoryCount", contentsummary.getDirectoryCount());
    m.put("quota", contentsummary.getQuota());
    m.put("spaceConsumed", contentsummary.getSpaceConsumed());
    m.put("spaceQuota", contentsummary.getSpaceQuota());
    return toJsonString(ContentSummary.class, m);
}
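The two-argument toJsonString(Class, Map) helper is not shown in this snippet; assuming it wraps the map under the class's simple name, as the WebHDFS JsonUtil this class mirrors does, the output would look roughly like the following (values are illustrative):

{"ContentSummary":{"directoryCount":2,"fileCount":1,"length":24930,"quota":-1,"spaceConsumed":24930,"spaceQuota":-1}}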
From source file:com.cloudera.sqoop.TestTargetDir.java
License:Apache License
/** Test that target-dir contains imported files. */
public void testTargetDir() throws IOException {
    try {
        String targetDir = getWarehouseDir() + "/tempTargetDir";
        ArrayList args = getOutputArgv(true);
        args.add("--target-dir");
        args.add(targetDir);

        // Delete target-dir if it exists; the import will recreate it.
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(targetDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        String[] argv = (String[]) args.toArray(new String[0]);
        runImport(argv);

        ContentSummary summ = fs.getContentSummary(outputPath);
        assertTrue("There are no new imported files in target-dir", summ.getFileCount() > 0);
    } catch (Exception e) {
        LOG.error("Got Exception: " + StringUtils.stringifyException(e));
        fail(e.toString());
    }
}
From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java
License:Open Source License
@Test
public void testMapFileWrite() throws Exception {
    Path file = Testfile.MAPFILE.filepath();
    logger.debug("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));
    startTimer();
    writeMapFile();
    logger.info("Duration: {}", stopTimer(Testfile.MAPFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files",
        String.format("%,d", fileInfo.getSpaceConsumed()),
        String.format("%,d", fileInfo.getFileCount()));
}
From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java
License:Open Source License
@Test
public void testHFileWrite() throws Exception {
    Path file = Testfile.HFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));
    startTimer();
    writeHFile(file, Compression.Algorithm.NONE);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files",
        String.format("%,d", fileInfo.getSpaceConsumed()),
        String.format("%,d", fileInfo.getFileCount()));
}
From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java
License:Open Source License
@Test
public void testHFileWriteGZ() throws Exception {
    Path file = Testfile.HFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));
    startTimer();
    writeHFile(file, Compression.Algorithm.GZ);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files",
        String.format("%,d", fileInfo.getSpaceConsumed()),
        String.format("%,d", fileInfo.getFileCount()));
}
From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java
License:Open Source License
@Test
public void testTFileWrite() throws Exception {
    Path file = Testfile.TFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));
    startTimer();
    writeTFile(file, TFile.COMPRESSION_NONE);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files",
        String.format("%,d", fileInfo.getSpaceConsumed()),
        String.format("%,d", fileInfo.getFileCount()));
}
From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java
License:Open Source License
@Test
public void testTFileWriteGZ() throws Exception {
    Path file = Testfile.TFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));
    startTimer();
    writeTFile(file, TFile.COMPRESSION_GZ);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files",
        String.format("%,d", fileInfo.getSpaceConsumed()),
        String.format("%,d", fileInfo.getFileCount()));
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
/**
 * @param inputPath
 * @throws IOException
 */
private void processHDFSSources(Path inputPath) throws IOException {
    // Try to get the fileStatus only if we're reasonably confident that this
    // is an HDFS path.
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    // Check if input is a directory.
    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory in HDFS: " + input);
    }

    // Accept only jobFiles, and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter =
        new JobFileModifiedRangePathFilter(myConf, 0L);

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath);

    // Get the files in the done folder. We need to traverse dirs under done
    // recursively for versions that include MAPREDUCE-323:
    // on/after hadoop 0.20.203.0, on/after cdh3u5.
    FileStatus[] jobFileStatusses =
        FileLister.listFiles(true, hdfs, inputPath, jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");
    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    int processedCount = 0;
    try {
        for (int i = 0; i < jobFileStatusses.length; i++) {
            FileStatus jobFileStatus = jobFileStatusses[i];
            boolean retain = BatchUtil.shouldRetain(i, maXretention, jobFileStatusses.length);
            processHDFSSource(hdfs, jobFileStatus, outputPath, myConf, skipExisting, retain);
            processedCount++;
            // Print something every 1k files to show progress.
            if ((i % 1000) == 0) {
                LOG.info("Processed " + i + " files.");
            }
        }
    } finally {
        LOG.info("Processed " + processedCount + " files.");
    }
}
From source file:com.twitter.hraven.etl.JobFilePreprocessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    // When we started processing. This is also the upper limit of files we
    // accept; the next run will pick up the new incoming files.
    long processingStartMillis = System.currentTimeMillis();

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments.
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Output should be an HDFS path.
    FileSystem hdfs = FileSystem.get(hbaseConf);

    // Grab the output path argument.
    String output = commandLine.getOptionValue("o");
    LOG.info(" output=" + output);
    Path outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
    }

    // Grab the input path argument.
    String input;
    if (commandLine.hasOption("i")) {
        input = commandLine.getOptionValue("i");
    } else {
        input = hbaseConf.get("mapred.job.tracker.history.completed.location");
    }
    LOG.info("input=" + input);

    // Grab the batch-size argument.
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check.
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                    + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    boolean forceAllFiles = commandLine.hasOption("f");
    LOG.info("forceAllFiles: " + forceAllFiles);

    Path inputPath = new Path(input);
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);
    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory: " + inputFileStatus.getPath().getName());
    }

    // Grab the cluster argument.
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    /**
     * Grab the size-of-huge-files-to-be-moved argument. An HBase cell can't
     * store files bigger than maxFileSize, hence there is no need to consider
     * them for raw loading.
     * Reference: {@link https://github.com/twitter/hraven/issues/59}
     */
    String maxFileSizeStr = commandLine.getOptionValue("s");
    LOG.info("maxFileSize=" + maxFileSizeStr);
    long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
    try {
        maxFileSize = Long.parseLong(maxFileSizeStr);
    } catch (NumberFormatException nfe) {
        throw new ProcessingException(
            "Caught NumberFormatException during conversion of maxFileSize to long", nfe);
    }

    ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);

    boolean success = true;
    try {
        // Figure out where we last left off (if anywhere at all).
        ProcessRecord lastProcessRecord = null;
        if (!forceAllFiles) {
            lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
        }

        long minModificationTimeMillis = 0;
        if (lastProcessRecord != null) {
            // Start of this time period is the end of the last period.
            minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
        }

        // Sanity check: the end time of the last scan had better not be later
        // than when we started processing.
        if (minModificationTimeMillis > processingStartMillis) {
            throw new RuntimeException(
                "The last processing record has maxModificationMillis later than now: "
                    + lastProcessRecord);
        }

        // Accept only jobFiles, and only those that fall in the desired range
        // of modification time.
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter =
            new JobFileModifiedRangePathFilter(hbaseConf, minModificationTimeMillis);

        String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

        ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
        LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
            + " that are modified since " + timestamp);

        // Get the files in the done folder. We need to traverse dirs under
        // done recursively for versions that include MAPREDUCE-323:
        // on/after hadoop 0.20.203.0, on/after cdh3u5.
        FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs,
            inputPath, jobFileModifiedRangePathFilter);

        LOG.info("Sorting " + jobFileStatusses.length + " job files.");
        Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

        // Process these files in batches.
        int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
        LOG.info("Batch count: " + batchCount);
        for (int b = 0; b < batchCount; b++) {
            processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
        }
    } finally {
        processRecordService.close();
    }

    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }

    // Return the status.
    return success ? 0 : 1;
}
From source file:org.apache.ignite.igfs.IgfsHadoopFileSystemAbstractSelfTest.java
License:Apache License
/**
 * Compare content of two folders.
 *
 * @param cfg Paths configuration to compare.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private void compareContent(Config cfg) throws IOException {
    Deque<Config> queue = new LinkedList<>();

    queue.add(cfg);

    for (Config c = queue.poll(); c != null; c = queue.poll()) {
        boolean exists;

        assertEquals("Check existence [src=" + c.src + ", dest=" + c.dest + ']',
            exists = c.srcFs.exists(c.src), c.destFs.exists(c.dest));

        assertEquals("Check types (files?) [src=" + c.src + ", dest=" + c.dest + ']',
            c.srcFs.isFile(c.src), c.destFs.isFile(c.dest));

        if (exists) {
            ContentSummary srcSummary = c.srcFs.getContentSummary(c.src);
            ContentSummary dstSummary = c.destFs.getContentSummary(c.dest);

            assertEquals("Directories number comparison failed",
                srcSummary.getDirectoryCount(), dstSummary.getDirectoryCount());

            assertEquals("Files number comparison failed",
                srcSummary.getFileCount(), dstSummary.getFileCount());

            assertEquals("Space consumed comparison failed",
                srcSummary.getSpaceConsumed(), dstSummary.getSpaceConsumed());

            assertEquals("Length comparison failed",
                srcSummary.getLength(), dstSummary.getLength());

            // Intentionally skipping quota checks as they can vary.
        } else {
            assertContentSummaryFails(c.srcFs, c.src);
            assertContentSummaryFails(c.destFs, c.dest);
        }

        if (!exists)
            continue;

        FileStatus[] srcSt = c.srcFs.listStatus(c.src);
        FileStatus[] destSt = c.destFs.listStatus(c.dest);

        assert srcSt != null && destSt != null : "Both not null"
            + " [srcSt=" + Arrays.toString(srcSt) + ", destSt=" + Arrays.toString(destSt) + ']';

        assertEquals("Check listing [src=" + c.src + ", dest=" + c.dest + ']',
            srcSt.length, destSt.length);

        // Listing of a file returns a single element holding that file.
        if (srcSt.length == 1 && c.src.equals(srcSt[0].getPath())) {
            assertEquals(c.dest, destSt[0].getPath());

            assertTrue("Expects file [src=" + c.src + ", srcSt[0]=" + srcSt[0] + ']', !srcSt[0].isDir());
            assertTrue("Expects file [dest=" + c.dest + ", destSt[0]=" + destSt[0] + ']', !destSt[0].isDir());

            FSDataInputStream srcIn = null;
            FSDataInputStream destIn = null;

            try {
                srcIn = c.srcFs.open(c.src);
                destIn = c.destFs.open(c.dest);

                GridTestIoUtils.assertEqualStreams(srcIn, destIn, srcSt[0].getLen());
            } finally {
                U.closeQuiet(srcIn);
                U.closeQuiet(destIn);
            }

            continue; // Skip the following directory validations.
        }

        // Sort both arrays.
        Arrays.sort(srcSt, STATUS_COMPARATOR);
        Arrays.sort(destSt, STATUS_COMPARATOR);

        for (int i = 0; i < srcSt.length; i++)
            // Dig deep to the last leaf instead of collecting the full tree in memory.
            queue.addFirst(new Config(c.srcFs, srcSt[i].getPath(), c.destFs, destSt[i].getPath()));

        // Add a non-existent file to check in the current folder.
        String rndFile = "Non-existent file #" + UUID.randomUUID().toString();

        queue.addFirst(new Config(c.srcFs, new Path(c.src, rndFile), c.destFs, new Path(c.dest, rndFile)));
    }
}