Example usage for org.apache.hadoop.fs ContentSummary getFileCount

Introduction

On this page you can find example usages of org.apache.hadoop.fs.ContentSummary.getFileCount(), which returns the number of files counted under the path for which the summary was computed.

Prototype

public long getFileCount() 
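
Before the collected examples, here is a minimal sketch of typical usage, assuming a reachable Hadoop FileSystem and an existing directory; the path below is hypothetical and only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileCountExample {
    public static void main(String[] args) throws Exception {
        // Assumes Hadoop configuration (core-site.xml, etc.) is on the classpath.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical directory; replace with a path that exists on your cluster.
        Path dir = new Path("/user/example/data");

        // getContentSummary() aggregates counts over the whole subtree.
        ContentSummary summary = fs.getContentSummary(dir);

        // getFileCount() returns the number of files; directories are counted separately.
        System.out.println("Files: " + summary.getFileCount());
        System.out.println("Directories: " + summary.getDirectoryCount());
        System.out.println("Bytes: " + summary.getLength());
    }
}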

Usage

From source file:com.bigstep.datalake.JsonUtil.java

License:Apache License

/** Convert a ContentSummary to a Json string. */
public static String toJsonString(final ContentSummary contentsummary) {
    if (contentsummary == null) {
        return null;
    }

    final Map<String, Object> m = new TreeMap<String, Object>();
    m.put("length", contentsummary.getLength());
    m.put("fileCount", contentsummary.getFileCount());
    m.put("directoryCount", contentsummary.getDirectoryCount());
    m.put("quota", contentsummary.getQuota());
    m.put("spaceConsumed", contentsummary.getSpaceConsumed());
    m.put("spaceQuota", contentsummary.getSpaceQuota());
    return toJsonString(ContentSummary.class, m);
}

From source file:com.cloudera.sqoop.TestTargetDir.java

License:Apache License

/** test target-dir contains imported files. */
public void testTargetDir() throws IOException {

    try {
        String targetDir = getWarehouseDir() + "/tempTargetDir";

        ArrayList args = getOutputArgv(true);
        args.add("--target-dir");
        args.add(targetDir);

        // delete target-dir if exists and recreate it
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(targetDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        String[] argv = (String[]) args.toArray(new String[0]);
        runImport(argv);

        ContentSummary summ = fs.getContentSummary(outputPath);

        assertTrue("There's no new imported files in target-dir", summ.getFileCount() > 0);

    } catch (Exception e) {
        LOG.error("Got Exception: " + StringUtils.stringifyException(e));
        fail(e.toString());
    }
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testMapFileWrite() throws Exception {

    Path file = Testfile.MAPFILE.filepath();
    logger.debug("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeMapFile();
    logger.info("Duration: {}", stopTimer(Testfile.MAPFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testHFileWrite() throws Exception {
    Path file = Testfile.HFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeHFile(file, Compression.Algorithm.NONE);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testHFileWriteGZ() throws Exception {
    Path file = Testfile.HFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeHFile(file, Compression.Algorithm.GZ);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testTFileWrite() throws Exception {
    Path file = Testfile.TFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeTFile(file, TFile.COMPRESSION_NONE);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testTFileWriteGZ() throws Exception {
    Path file = Testfile.TFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeTFile(file, TFile.COMPRESSION_GZ);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.twitter.hraven.etl.JobFilePartitioner.java

License:Apache License

/**
 * @param inputPath
 * @throws IOException
 */
private void processHDFSSources(Path inputPath) throws IOException {
    // Try to get the fileStatus only if we're reasonably confident that this
    // is an HDFS path.
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    // Check if input is a directory
    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory in HDFS: " + input);
    }

    // Accept only jobFiles and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(myConf,
            0L);

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath);

    // get the files in the done folder,
    // need to traverse dirs under done recursively for versions
    // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
    // on/after cdh3u5
    FileStatus[] jobFileStatusses = FileLister.listFiles(true, hdfs, inputPath, jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");

    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    int processedCount = 0;
    try {

        for (int i = 0; i < jobFileStatusses.length; i++) {
            FileStatus jobFileStatus = jobFileStatusses[i];

            boolean retain = BatchUtil.shouldRetain(i, maXretention, jobFileStatusses.length);
            processHDFSSource(hdfs, jobFileStatus, outputPath, myConf, skipExisting, retain);
            processedCount++;
            // Log progress every 1,000 files.
            if ((i % 1000) == 0) {
                LOG.info("Processed " + i + " files.");
            }

        }

    } finally {
        LOG.info("Processed " + processedCount + " files.");
    }
}

From source file:com.twitter.hraven.etl.JobFilePreprocessor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    // Record when we started processing. This is also the upper limit for the
    // files we accept; the next run will pick up newer incoming files.
    long processingStartMillis = System.currentTimeMillis();

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Output should be an hdfs path.
    FileSystem hdfs = FileSystem.get(hbaseConf);

    // Grab the output path argument
    String output = commandLine.getOptionValue("o");
    LOG.info(" output=" + output);
    Path outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory" + outputFileStatus.getPath().getName());
    }/*from w  ww .j  a  v a2 s.  c  o m*/

    // Grab the input path argument
    String input;
    if (commandLine.hasOption("i")) {
        input = commandLine.getOptionValue("i");
    } else {
        input = hbaseConf.get("mapred.job.tracker.history.completed.location");
    }
    LOG.info("input=" + input);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    boolean forceAllFiles = commandLine.hasOption("f");
    LOG.info("forceAllFiles: " + forceAllFiles);

    Path inputPath = new Path(input);
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory" + inputFileStatus.getPath().getName());
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    /**
     * Grab the maximum file size argument. An HBase cell can't store files
     * bigger than maxFileSize, hence there is no need to consider them for raw loading.
     * Reference:
     * {@link https://github.com/twitter/hraven/issues/59}
     */
    String maxFileSizeStr = commandLine.getOptionValue("s");
    LOG.info("maxFileSize=" + maxFileSizeStr);
    long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
    try {
        maxFileSize = Long.parseLong(maxFileSizeStr);
    } catch (NumberFormatException nfe) {
        throw new ProcessingException(
                "Caught NumberFormatException during conversion " + " of maxFileSize to long", nfe);
    }

    ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);

    boolean success = true;
    try {

        // Figure out where we last left off (if anywhere at all)
        ProcessRecord lastProcessRecord = null;

        if (!forceAllFiles) {
            lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
        }

        long minModificationTimeMillis = 0;
        if (lastProcessRecord != null) {
            // Start of this time period is the end of the last period.
            minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
        }

        // Do a sanity check. The end time of the last scan better not be later
        // than when we started processing.
        if (minModificationTimeMillis > processingStartMillis) {
            throw new RuntimeException("The last processing record has maxModificationMillis later than now: "
                    + lastProcessRecord);
        }

        // Accept only jobFiles and only those that fall in the desired range of
        // modification time.
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(
                hbaseConf, minModificationTimeMillis);

        String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

        ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
        LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
                + " that are modified since " + timestamp);

        // get the files in the done folder,
        // need to traverse dirs under done recursively for versions
        // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
        // on/after cdh3u5
        FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
                jobFileModifiedRangePathFilter);

        LOG.info("Sorting " + jobFileStatusses.length + " job files.");

        Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

        // Process these files in batches at a time.
        int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
        LOG.info("Batch count: " + batchCount);
        for (int b = 0; b < batchCount; b++) {
            processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
        }

    } finally {
        processRecordService.close();
    }

    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }

    // Return the status
    return success ? 0 : 1;
}

From source file:org.apache.ignite.igfs.IgfsHadoopFileSystemAbstractSelfTest.java

License:Apache License

/**
 * Compare content of two folders.
 *
 * @param cfg Paths configuration to compare.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private void compareContent(Config cfg) throws IOException {
    Deque<Config> queue = new LinkedList<>();

    queue.add(cfg);

    for (Config c = queue.poll(); c != null; c = queue.poll()) {
        boolean exists;

        assertEquals("Check existence [src=" + c.src + ", dest=" + c.dest + ']', exists = c.srcFs.exists(c.src),
                c.destFs.exists(c.dest));

        assertEquals("Check types (files?) [src=" + c.src + ", dest=" + c.dest + ']', c.srcFs.isFile(c.src),
                c.destFs.isFile(c.dest));

        if (exists) {
            ContentSummary srcSummary = c.srcFs.getContentSummary(c.src);
            ContentSummary dstSummary = c.destFs.getContentSummary(c.dest);

            assertEquals("Directories number comparison failed", srcSummary.getDirectoryCount(),
                    dstSummary.getDirectoryCount());

            assertEquals("Files number comparison failed", srcSummary.getFileCount(),
                    dstSummary.getFileCount());

            assertEquals("Space consumed comparison failed", srcSummary.getSpaceConsumed(),
                    dstSummary.getSpaceConsumed());

            assertEquals("Length comparison failed", srcSummary.getLength(), dstSummary.getLength());

            // Intentionally skipping quotas checks as they can vary.
        } else {
            assertContentSummaryFails(c.srcFs, c.src);
            assertContentSummaryFails(c.destFs, c.dest);
        }

        if (!exists)
            continue;

        FileStatus[] srcSt = c.srcFs.listStatus(c.src);
        FileStatus[] destSt = c.destFs.listStatus(c.dest);

        assert srcSt != null && destSt != null : "Both not null" + " [srcSt=" + Arrays.toString(srcSt)
                + ", destSt=" + Arrays.toString(destSt) + ']';

        assertEquals("Check listing [src=" + c.src + ", dest=" + c.dest + ']', srcSt.length, destSt.length);

        // Listing of the file returns the only element with this file.
        if (srcSt.length == 1 && c.src.equals(srcSt[0].getPath())) {
            assertEquals(c.dest, destSt[0].getPath());

            assertTrue("Expects file [src=" + c.src + ", srcSt[0]=" + srcSt[0] + ']', !srcSt[0].isDir());
            assertTrue("Expects file [dest=" + c.dest + ", destSt[0]=" + destSt[0] + ']', !destSt[0].isDir());

            FSDataInputStream srcIn = null;
            FSDataInputStream destIn = null;

            try {
                srcIn = c.srcFs.open(c.src);
                destIn = c.destFs.open(c.dest);

                GridTestIoUtils.assertEqualStreams(srcIn, destIn, srcSt[0].getLen());
            } finally {
                U.closeQuiet(srcIn);
                U.closeQuiet(destIn);
            }

            continue; // Skip the following directories validations.
        }

        // Sort both arrays.
        Arrays.sort(srcSt, STATUS_COMPARATOR);
        Arrays.sort(destSt, STATUS_COMPARATOR);

        for (int i = 0; i < srcSt.length; i++)
            // Dig in deep to the last leaf, instead of collecting full tree in memory.
            queue.addFirst(new Config(c.srcFs, srcSt[i].getPath(), c.destFs, destSt[i].getPath()));

        // Add non-existent file to check in the current folder.
        String rndFile = "Non-existent file #" + UUID.randomUUID().toString();

        queue.addFirst(new Config(c.srcFs, new Path(c.src, rndFile), c.destFs, new Path(c.dest, rndFile)));
    }
}