Example usage for org.apache.hadoop.fs FileSystem globStatus

List of usage examples for org.apache.hadoop.fs FileSystem globStatus

Introduction

On this page you can find example usages of org.apache.hadoop.fs.FileSystem.globStatus.

Prototype

public FileStatus[] globStatus(Path pathPattern) throws IOException 

Source Link

Document

Returns all the files that match pathPattern and are not checksum files.

Usage

From source file:com.yahoo.labs.samoa.streams.fs.HDFSFileStreamSource.java

License:Apache License

/**
 * Initializes this source by collecting the file paths to read.
 *
 * <p>When {@code path} is a directory, its direct children are listed through a glob
 * ({@code *.ext} when {@code ext} is given, {@code *} otherwise) and only regular files
 * are kept. When {@code path} is not a directory, {@code path} itself is the only entry.
 * The current index is reset to {@code -1} once the list has been built.
 *
 * @param config Hadoop configuration used to obtain the file system
 * @param path   file or directory to read from
 * @param ext    file extension to filter on inside a directory, or {@code null} for all files
 * @throws RuntimeException wrapping the {@link IOException} if the file system cannot be queried
 */
public void init(Configuration config, String path, String ext) {
    this.config = config;
    this.filePaths = new ArrayList<String>();
    Path root = new Path(path);
    try {
        FileSystem fileSystem = FileSystem.get(config);
        if (!fileSystem.getFileStatus(root).isDirectory()) {
            // A single file: use the path exactly as it was given.
            this.filePaths.add(path);
        } else {
            String pattern = (ext == null) ? "*" : "*." + ext;
            for (FileStatus candidate : fileSystem.globStatus(new Path(path, pattern))) {
                // Sub-directories matching the pattern are skipped.
                if (candidate.isFile()) {
                    this.filePaths.add(candidate.getPath().toString());
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException("Failed getting list of files at:" + path, ioe);
    }

    this.currentIndex = -1;
}

From source file:datafu.hourglass.demo.Examples.java

License:Apache License

/**
 * Loads the member count stored under {@code path/timestamp}.
 *
 * <p>The first {@code *.avro} file matched by the glob is opened, its first record is read,
 * and the {@code count} field of the record's {@code value} is returned. Later files are
 * never visited because the method returns (or throws) while handling the first one.
 *
 * @param path      parent directory of the timestamped output
 * @param timestamp name of the sub-directory to read
 * @return the {@code count} value of the first record
 * @throws IOException      if the file system or the Avro stream cannot be read
 * @throws RuntimeException with message {@code "found no data"} if no {@code *.avro} file matches
 */
private Long loadMemberCount(Path path, String timestamp) throws IOException {
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                GenericRecord r = dataFileStream.next();
                Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                Assert.assertNotNull(count);
                System.out.println("found count: " + count);
                return count;
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    throw new RuntimeException("found no data");
}

From source file:datafu.hourglass.demo.Examples.java

License:Apache License

/**
 * Loads per-member counts from every {@code *.avro} file under {@code path/timestamp}.
 *
 * <p>For each record the key's {@code member_id} is mapped to the value's {@code count}.
 * Both fields are asserted to be non-null, and each member id is asserted to appear only once
 * across all files.
 *
 * @param path      parent directory of the timestamped output
 * @param timestamp name of the sub-directory to read
 * @return map from member id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Integer> loadOutputCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Integer> counts = new HashMap<Long, Integer>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    _log.info("found: " + r.toString());
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("member_id");
                    Assert.assertNotNull(memberId);
                    Integer count = (Integer) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertNotNull(count);
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java

License:Apache License

/**
 * Loads intermediate counts from every {@code *.avro} file in the nested directory that
 * {@code getNestedPathFromTimestamp(timestamp)} resolves to under {@code _intermediatePath}.
 *
 * <p>For each record the key's {@code id} is mapped to the value's {@code count}; each id is
 * asserted to appear only once across all files.
 *
 * @param path      not read by this method; files are resolved against {@code _intermediatePath}
 * @param timestamp timestamp used to derive the nested directory
 * @return map from id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Long> loadIntermediateCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_intermediatePath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java

License:Apache License

/**
 * Loads output counts from every {@code *.avro} file under {@code path/timestamp}.
 *
 * <p>For each record the key's {@code id} is mapped to the value's {@code count}; each id is
 * asserted to appear only once across all files.
 *
 * @param path      parent directory of the timestamped output
 * @param timestamp name of the sub-directory to read
 * @return map from id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(path, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.FirstPassCountJobTests.java

License:Apache License

/**
 * Loads output counts from every {@code *.avro} file in the nested directory that
 * {@code getNestedPathFromTimestamp(timestamp)} resolves to under {@code _outputPath}.
 *
 * <p>For each record the key's {@code id} is mapped to the value's {@code count}; each id is
 * asserted to appear only once across all files.
 *
 * @param timestamp timestamp used to derive the nested directory
 * @return map from id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.PartitionCollapsingJoinTest.java

License:Apache License

/**
 * Loads impression and click totals from every {@code *.avro} file under
 * {@code _outputPath/timestamp}.
 *
 * <p>For each record the key's {@code id} is mapped to an {@code ImpressionClick} filled from
 * the value's {@code impressions} and {@code clicks} fields; each id is asserted to appear
 * only once across all files.
 *
 * @param timestamp name of the sub-directory to read
 * @return map from id to its impression/click data; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    // Look the value record up once and read both fields from it.
                    GenericRecord value = (GenericRecord) r.get("value");
                    Integer impressions = (Integer) value.get("impressions");
                    Integer clicks = (Integer) value.get("clicks");
                    Assert.assertFalse(counts.containsKey(memberId));
                    ImpressionClick data = new ImpressionClick();
                    data.clicks = clicks;
                    data.impressions = impressions;
                    counts.put(memberId, data);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.PartitionCollapsingTests.java

License:Apache License

/**
 * Loads output counts from every {@code *.avro} file under {@code _outputPath/timestamp}.
 *
 * <p>For each record the key's {@code id} is mapped to the value's {@code count}; each id is
 * asserted to appear only once across all files.
 *
 * @param timestamp name of the sub-directory to read
 * @return map from id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.PartitionPreservingJoinTests.java

License:Apache License

/**
 * Loads impression and click totals from every {@code *.avro} file in the nested directory
 * that {@code getNestedPathFromTimestamp(timestamp)} resolves to under {@code _outputPath}.
 *
 * <p>For each record the key's {@code id} is mapped to an {@code ImpressionClick} filled from
 * the value's {@code impressions} and {@code clicks} fields; each id is asserted to appear
 * only once across all files.
 *
 * @param timestamp timestamp used to derive the nested directory
 * @return map from id to its impression/click data; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    // Look the value record up once and read both fields from it.
                    GenericRecord value = (GenericRecord) r.get("value");
                    Integer impressions = (Integer) value.get("impressions");
                    Integer clicks = (Integer) value.get("clicks");
                    Assert.assertFalse(counts.containsKey(memberId));
                    ImpressionClick data = new ImpressionClick();
                    data.clicks = clicks;
                    data.impressions = impressions;
                    counts.put(memberId, data);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}

From source file:datafu.hourglass.test.TestAvroJob.java

License:Apache License

/**
 * Loads output counts from every {@code *.avro} file under {@code _outputPath/timestamp}.
 *
 * <p>Each record is read as a flat record: its {@code id} field is mapped to its
 * {@code count} field, and each id is asserted to appear only once across all files.
 *
 * @param timestamp name of the sub-directory to read
 * @return map from id to count; empty if no file matches
 * @throws IOException if the file system or an Avro stream cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        // The outer finally releases the raw stream even when the Avro reader
        // cannot be constructed (for example on a malformed file header).
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) r.get("id");
                    Long count = (Long) r.get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            is.close();
        }
    }
    return counts;
}