Example usage for org.apache.hadoop.fs ContentSummary getFileCount

Introduction

On this page you can find example usages of org.apache.hadoop.fs.ContentSummary.getFileCount(), which returns the number of files counted under the path for which the summary was computed.

Prototype

public long getFileCount() 
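
Before the collected examples, here is a minimal sketch of typical usage, assuming a reachable Hadoop FileSystem and an existing directory; the path below is hypothetical and only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FileCountExample {
    public static void main(String[] args) throws Exception {
        // Assumes Hadoop configuration (core-site.xml, etc.) is on the classpath.
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical directory; replace with a path that exists on your cluster.
        Path dir = new Path("/user/example/data");

        // getContentSummary() aggregates counts over the whole subtree.
        ContentSummary summary = fs.getContentSummary(dir);

        // getFileCount() returns the number of files; directories are counted separately.
        System.out.println("Files: " + summary.getFileCount());
        System.out.println("Directories: " + summary.getDirectoryCount());
        System.out.println("Bytes: " + summary.getLength());
    }
}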

Usage

From source file:com.bigstep.datalake.JsonUtil.java

License:Apache License

/** Convert a ContentSummary to a Json string. */
public static String toJsonString(final ContentSummary contentsummary) {
    if (contentsummary == null) {
        return null;
    }

    final Map<String, Object> m = new TreeMap<String, Object>();
    m.put("length", contentsummary.getLength());
    m.put("fileCount", contentsummary.getFileCount());
    m.put("directoryCount", contentsummary.getDirectoryCount());
    m.put("quota", contentsummary.getQuota());
    m.put("spaceConsumed", contentsummary.getSpaceConsumed());
    m.put("spaceQuota", contentsummary.getSpaceQuota());
    return toJsonString(ContentSummary.class, m);
}

From source file:com.cloudera.sqoop.TestTargetDir.java

License:Apache License

/** test target-dir contains imported files. */
public void testTargetDir() throws IOException {

    try {
        String targetDir = getWarehouseDir() + "/tempTargetDir";

        ArrayList args = getOutputArgv(true);
        args.add("--target-dir");
        args.add(targetDir);

        // delete target-dir if exists and recreate it
        FileSystem fs = FileSystem.get(getConf());
        Path outputPath = new Path(targetDir);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        String[] argv = (String[]) args.toArray(new String[0]);
        runImport(argv);

        ContentSummary summ = fs.getContentSummary(outputPath);

        assertTrue("There's no new imported files in target-dir", summ.getFileCount() > 0);

    } catch (Exception e) {
        LOG.error("Got Exception: " + StringUtils.stringifyException(e));
        fail(e.toString());
    }
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testMapFileWrite() throws Exception {

    Path file = Testfile.MAPFILE.filepath();
    logger.debug("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeMapFile();
    logger.info("Duration: {}", stopTimer(Testfile.MAPFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testHFileWrite() throws Exception {
    Path file = Testfile.HFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeHFile(file, Compression.Algorithm.NONE);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testHFileWriteGZ() throws Exception {
    Path file = Testfile.HFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeHFile(file, Compression.Algorithm.GZ);
    logger.info("Duration: {}", stopTimer(Testfile.HFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testTFileWrite() throws Exception {
    Path file = Testfile.TFILE.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeTFile(file, TFile.COMPRESSION_NONE);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.datatorrent.contrib.hdht.HadoopFilePerformanceTest.java

License:Open Source License

@Test
public void testTFileWriteGZ() throws Exception {
    Path file = Testfile.TFILE_GZ.filepath();
    logger.info("Writing {} with {} key/value pairs", file, String.format("%,d", testSize));

    startTimer();
    writeTFile(file, TFile.COMPRESSION_GZ);
    logger.info("Duration: {}", stopTimer(Testfile.TFILE_GZ, "WRITE"));

    Assert.assertTrue(hdfs.exists(file));
    ContentSummary fileInfo = hdfs.getContentSummary(file);
    logger.debug("Space consumed: {} bytes in {} files", String.format("%,d", fileInfo.getSpaceConsumed()),
            String.format("%,d", fileInfo.getFileCount()));
}

From source file:com.twitter.hraven.etl.JobFilePartitioner.java

License:Apache License

/**
 * @param inputPath
 * @throws IOException
 */
private void processHDFSSources(Path inputPath) throws IOException {
    // Try to get the fileStatus only if we're reasonably confident that this
    // is an HDFS path.
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    // Check if input is a directory
    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory in HDFS: " + input);
    }

    // Accept only jobFiles and only those that fall in the desired range of
    // modification time.
    JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(myConf,
            0L);

    ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
    LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath);

    // get the files in the done folder,
    // need to traverse dirs under done recursively for versions
    // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
    // on/after cdh3u5
    FileStatus[] jobFileStatusses = FileLister.listFiles(true, hdfs, inputPath, jobFileModifiedRangePathFilter);

    LOG.info("Sorting " + jobFileStatusses.length + " job files.");

    Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

    int processedCount = 0;
    try {

        for (int i = 0; i < jobFileStatusses.length; i++) {
            FileStatus jobFileStatus = jobFileStatusses[i];

            boolean retain = BatchUtil.shouldRetain(i, maXretention, jobFileStatusses.length);
            processHDFSSource(hdfs, jobFileStatus, outputPath, myConf, skipExisting, retain);
            processedCount++;
            // Log progress every 1,000 files.
            if ((i % 1000) == 0) {
                LOG.info("Processed " + i + " files.");
            }

        }

    } finally {
        LOG.info("Processed " + processedCount + " files.");
    }
}

From source file:com.twitter.hraven.etl.JobFilePreprocessor.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    // Record when we started processing. This is also the upper limit for the
    // files we accept; the next run will pick up newer incoming files.
    long processingStartMillis = System.currentTimeMillis();

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Output should be an hdfs path.
    FileSystem hdfs = FileSystem.get(hbaseConf);

    // Grab the output path argument
    String output = commandLine.getOptionValue("o");
    LOG.info(" output=" + output);
    Path outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory" + outputFileStatus.getPath().getName());
    }/*from w  ww .j  a  v a2 s.  c  o m*/

    // Grab the input path argument
    String input;
    if (commandLine.hasOption("i")) {
        input = commandLine.getOptionValue("i");
    } else {
        input = hbaseConf.get("mapred.job.tracker.history.completed.location");
    }
    LOG.info("input=" + input);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    boolean forceAllFiles = commandLine.hasOption("f");
    LOG.info("forceAllFiles: " + forceAllFiles);

    Path inputPath = new Path(input);
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory" + inputFileStatus.getPath().getName());
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    /**
     * Grab the maximum file size argument. An HBase cell can't store files
     * bigger than maxFileSize, hence there is no need to consider them for raw loading.
     * Reference:
     * {@link https://github.com/twitter/hraven/issues/59}
     */
    String maxFileSizeStr = commandLine.getOptionValue("s");
    LOG.info("maxFileSize=" + maxFileSizeStr);
    long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
    try {
        maxFileSize = Long.parseLong(maxFileSizeStr);
    } catch (NumberFormatException nfe) {
        throw new ProcessingException(
                "Caught NumberFormatException during conversion " + " of maxFileSize to long", nfe);
    }

    ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);

    boolean success = true;
    try {

        // Figure out where we last left off (if anywhere at all)
        ProcessRecord lastProcessRecord = null;

        if (!forceAllFiles) {
            lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
        }

        long minModificationTimeMillis = 0;
        if (lastProcessRecord != null) {
            // Start of this time period is the end of the last period.
            minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
        }

        // Do a sanity check. The end time of the last scan better not be later
        // than when we started processing.
        if (minModificationTimeMillis > processingStartMillis) {
            throw new RuntimeException("The last processing record has maxModificationMillis later than now: "
                    + lastProcessRecord);
        }

        // Accept only jobFiles and only those that fall in the desired range of
        // modification time.
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(
                hbaseConf, minModificationTimeMillis);

        String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

        ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
        LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
                + " that are modified since " + timestamp);

        // get the files in the done folder,
        // need to traverse dirs under done recursively for versions
        // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
        // on/after cdh3u5
        FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
                jobFileModifiedRangePathFilter);

        LOG.info("Sorting " + jobFileStatusses.length + " job files.");

        Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

        // Process these files in batches at a time.
        int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
        LOG.info("Batch count: " + batchCount);
        for (int b = 0; b < batchCount; b++) {
            processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
        }

    } finally {
        processRecordService.close();
    }

    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }

    // Return the status
    return success ? 0 : 1;
}

From source file:org.apache.ignite.igfs.IgfsHadoopFileSystemAbstractSelfTest.java

License:Apache License

/**
 * Compare content of two folders.
 *
 * @param cfg Paths configuration to compare.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private void compareContent(Config cfg) throws IOException {
    Deque<Config> queue = new LinkedList<>();

    queue.add(cfg);

    for (Config c = queue.poll(); c != null; c = queue.poll()) {
        boolean exists;

        assertEquals("Check existence [src=" + c.src + ", dest=" + c.dest + ']', exists = c.srcFs.exists(c.src),
                c.destFs.exists(c.dest));

        assertEquals("Check types (files?) [src=" + c.src + ", dest=" + c.dest + ']', c.srcFs.isFile(c.src),
                c.destFs.isFile(c.dest));

        if (exists) {
            ContentSummary srcSummary = c.srcFs.getContentSummary(c.src);
            ContentSummary dstSummary = c.destFs.getContentSummary(c.dest);

            assertEquals("Directories number comparison failed", srcSummary.getDirectoryCount(),
                    dstSummary.getDirectoryCount());

            assertEquals("Files number comparison failed", srcSummary.getFileCount(),
                    dstSummary.getFileCount());

            assertEquals("Space consumed comparison failed", srcSummary.getSpaceConsumed(),
                    dstSummary.getSpaceConsumed());

            assertEquals("Length comparison failed", srcSummary.getLength(), dstSummary.getLength());

            // Intentionally skipping quotas checks as they can vary.
        } else {
            assertContentSummaryFails(c.srcFs, c.src);
            assertContentSummaryFails(c.destFs, c.dest);
        }

        if (!exists)
            continue;

        FileStatus[] srcSt = c.srcFs.listStatus(c.src);
        FileStatus[] destSt = c.destFs.listStatus(c.dest);

        assert srcSt != null && destSt != null : "Both not null" + " [srcSt=" + Arrays.toString(srcSt)
                + ", destSt=" + Arrays.toString(destSt) + ']';

        assertEquals("Check listing [src=" + c.src + ", dest=" + c.dest + ']', srcSt.length, destSt.length);

        // Listing of the file returns the only element with this file.
        if (srcSt.length == 1 && c.src.equals(srcSt[0].getPath())) {
            assertEquals(c.dest, destSt[0].getPath());

            assertTrue("Expects file [src=" + c.src + ", srcSt[0]=" + srcSt[0] + ']', !srcSt[0].isDir());
            assertTrue("Expects file [dest=" + c.dest + ", destSt[0]=" + destSt[0] + ']', !destSt[0].isDir());

            FSDataInputStream srcIn = null;
            FSDataInputStream destIn = null;

            try {
                srcIn = c.srcFs.open(c.src);
                destIn = c.destFs.open(c.dest);

                GridTestIoUtils.assertEqualStreams(srcIn, destIn, srcSt[0].getLen());
            } finally {
                U.closeQuiet(srcIn);
                U.closeQuiet(destIn);
            }

            continue; // Skip the following directories validations.
        }

        // Sort both arrays.
        Arrays.sort(srcSt, STATUS_COMPARATOR);
        Arrays.sort(destSt, STATUS_COMPARATOR);

        for (int i = 0; i < srcSt.length; i++)
            // Dig in deep to the last leaf, instead of collecting full tree in memory.
            queue.addFirst(new Config(c.srcFs, srcSt[i].getPath(), c.destFs, destSt[i].getPath()));

        // Add non-existent file to check in the current folder.
        String rndFile = "Non-existent file #" + UUID.randomUUID().toString();

        queue.addFirst(new Config(c.srcFs, new Path(c.src, rndFile), c.destFs, new Path(c.dest, rndFile)));
    }
}