List of usage examples for org.apache.hadoop.fs.FileSystem#globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files.
From source file:com.yahoo.labs.samoa.streams.fs.HDFSFileStreamSource.java
License:Apache License
/**
 * Records the configuration and builds the list of file paths this source will read.
 *
 * <p>If {@code path} is a directory, every regular file directly inside it that matches
 * {@code *.ext} (or {@code *} when {@code ext} is {@code null}) is added, using the string
 * form of the path reported by the file system. Otherwise {@code path} itself is added
 * unchanged. The current index is reset to {@code -1} afterwards.
 *
 * @param config Hadoop configuration used to obtain the file system
 * @param path   file or directory to read from
 * @param ext    file extension to filter on, or {@code null} to accept every file
 * @throws RuntimeException wrapping the {@link IOException} if the file system cannot be queried
 */
public void init(Configuration config, String path, String ext) {
    this.config = config;
    this.filePaths = new ArrayList<String>();
    Path root = new Path(path);
    try {
        FileSystem fileSystem = FileSystem.get(config);
        if (!fileSystem.getFileStatus(root).isDirectory()) {
            // A single file: keep the caller-supplied string as-is.
            this.filePaths.add(path);
        } else {
            String pattern = (ext == null) ? "*" : "*." + ext;
            for (FileStatus entry : fileSystem.globStatus(new Path(path, pattern))) {
                // Sub-directories that happen to match the pattern are skipped.
                if (entry.isFile()) {
                    this.filePaths.add(entry.getPath().toString());
                }
            }
        }
    } catch (IOException ioe) {
        throw new RuntimeException("Failed getting list of files at:" + path, ioe);
    }
    this.currentIndex = -1;
}
From source file:datafu.hourglass.demo.Examples.java
License:Apache License
private Long loadMemberCount(Path path, String timestamp) throws IOException { FileSystem fs = getFileSystem(); Assert.assertTrue(fs.exists(new Path(path, timestamp))); for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) { _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen())); FSDataInputStream is = fs.open(stat.getPath()); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader); try {// w ww. ja v a 2 s .c o m GenericRecord r = dataFileStream.next(); Long count = (Long) ((GenericRecord) r.get("value")).get("count"); Assert.assertNotNull(count); System.out.println("found count: " + count); return count; } finally { dataFileStream.close(); } } throw new RuntimeException("found no data"); }
From source file:datafu.hourglass.demo.Examples.java
License:Apache License
private HashMap<Long, Integer> loadOutputCounts(Path path, String timestamp) throws IOException { HashMap<Long, Integer> counts = new HashMap<Long, Integer>(); FileSystem fs = getFileSystem(); Assert.assertTrue(fs.exists(new Path(path, timestamp))); for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) { _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen())); FSDataInputStream is = fs.open(stat.getPath()); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader); try {//from w w w. j av a 2s . c o m while (dataFileStream.hasNext()) { GenericRecord r = dataFileStream.next(); _log.info("found: " + r.toString()); Long memberId = (Long) ((GenericRecord) r.get("key")).get("member_id"); Assert.assertNotNull(memberId); Integer count = (Integer) ((GenericRecord) r.get("value")).get("count"); Assert.assertNotNull(count); Assert.assertFalse(counts.containsKey(memberId)); counts.put(memberId, count); } } finally { dataFileStream.close(); } } return counts; }
From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java
License:Apache License
/**
 * Loads the intermediate per-id counts from every {@code *.avro} file under the nested
 * directory that {@code getNestedPathFromTimestamp(timestamp)} yields inside
 * {@code _intermediatePath}.
 *
 * <p>Each record contributes one entry: the key is {@code key.id} and the value is
 * {@code value.count}. An id appearing more than once fails an assertion.
 *
 * @param path      not used by this method; the files are located via {@code _intermediatePath}
 *                  (NOTE(review): confirm whether callers expect {@code path} to be honoured)
 * @param timestamp timestamp that is converted into the nested directory name
 * @return map from id to count; empty when no file matches or the files hold no records
 * @throws IOException if the file system or an Avro container cannot be read
 */
private HashMap<Long, Long> loadIntermediateCounts(Path path, String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_intermediatePath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_intermediatePath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            // Constructing the DataFileStream reads the container header and can throw;
            // the outer finally makes sure the raw stream is released in that case too.
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            // Closing an already-closed stream has no effect per the Closeable contract.
            is.close();
        }
    }
    return counts;
}
From source file:datafu.hourglass.test.FirstAndSecondPassJobTests.java
License:Apache License
private HashMap<Long, Long> loadOutputCounts(Path path, String timestamp) throws IOException { HashMap<Long, Long> counts = new HashMap<Long, Long>(); FileSystem fs = getFileSystem(); Assert.assertTrue(fs.exists(new Path(path, timestamp))); for (FileStatus stat : fs.globStatus(new Path(path, timestamp + "/*.avro"))) { _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen())); FSDataInputStream is = fs.open(stat.getPath()); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader); try {// w w w . j a v a 2 s.c o m while (dataFileStream.hasNext()) { GenericRecord r = dataFileStream.next(); Long memberId = (Long) ((GenericRecord) r.get("key")).get("id"); Long count = (Long) ((GenericRecord) r.get("value")).get("count"); Assert.assertFalse(counts.containsKey(memberId)); counts.put(memberId, count); } } finally { dataFileStream.close(); } } return counts; }
From source file:datafu.hourglass.test.FirstPassCountJobTests.java
License:Apache License
/**
 * Loads the per-id counts from every {@code *.avro} file under the nested directory that
 * {@code getNestedPathFromTimestamp(timestamp)} yields inside {@code _outputPath}.
 *
 * <p>Each record contributes one entry: the key is {@code key.id} and the value is
 * {@code value.count}. An id appearing more than once fails an assertion.
 *
 * @param timestamp timestamp that is converted into the nested directory name
 * @return map from id to count; empty when no file matches or the files hold no records
 * @throws IOException if the file system or an Avro container cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    String nestedPath = getNestedPathFromTimestamp(timestamp);
    Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            // Constructing the DataFileStream reads the container header and can throw;
            // the outer finally makes sure the raw stream is released in that case too.
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Long count = (Long) ((GenericRecord) r.get("value")).get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            // Closing an already-closed stream has no effect per the Closeable contract.
            is.close();
        }
    }
    return counts;
}
From source file:datafu.hourglass.test.PartitionCollapsingJoinTest.java
License:Apache License
/**
 * Loads the per-id impression and click totals from every {@code *.avro} file under
 * {@code _outputPath/timestamp}.
 *
 * <p>Each record contributes one entry: the key is {@code key.id} and the value is an
 * {@code ImpressionClick} populated from {@code value.impressions} and {@code value.clicks}.
 * An id appearing more than once fails an assertion.
 *
 * @param timestamp sub-directory of {@code _outputPath} that must exist and holds the Avro files
 * @return map from id to its impression/click data; empty when nothing is found
 * @throws IOException if the file system or an Avro container cannot be read
 */
private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            // Constructing the DataFileStream reads the container header and can throw;
            // the outer finally makes sure the raw stream is released in that case too.
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) ((GenericRecord) r.get("key")).get("id");
                    Integer impressions = (Integer) ((GenericRecord) r.get("value")).get("impressions");
                    Integer clicks = (Integer) ((GenericRecord) r.get("value")).get("clicks");
                    Assert.assertFalse(counts.containsKey(memberId));
                    ImpressionClick data = new ImpressionClick();
                    data.clicks = clicks;
                    data.impressions = impressions;
                    counts.put(memberId, data);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            // Closing an already-closed stream has no effect per the Closeable contract.
            is.close();
        }
    }
    return counts;
}
From source file:datafu.hourglass.test.PartitionCollapsingTests.java
License:Apache License
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException { HashMap<Long, Long> counts = new HashMap<Long, Long>(); FileSystem fs = getFileSystem(); Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp))); for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) { _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen())); FSDataInputStream is = fs.open(stat.getPath()); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader); try {//from ww w . j a v a2 s.c om while (dataFileStream.hasNext()) { GenericRecord r = dataFileStream.next(); Long memberId = (Long) ((GenericRecord) r.get("key")).get("id"); Long count = (Long) ((GenericRecord) r.get("value")).get("count"); Assert.assertFalse(counts.containsKey(memberId)); counts.put(memberId, count); } } finally { dataFileStream.close(); } } return counts; }
From source file:datafu.hourglass.test.PartitionPreservingJoinTests.java
License:Apache License
private HashMap<Long, ImpressionClick> loadOutputCounts(String timestamp) throws IOException { HashMap<Long, ImpressionClick> counts = new HashMap<Long, ImpressionClick>(); FileSystem fs = getFileSystem(); String nestedPath = getNestedPathFromTimestamp(timestamp); Assert.assertTrue(fs.exists(new Path(_outputPath, nestedPath))); for (FileStatus stat : fs.globStatus(new Path(_outputPath, nestedPath + "/*.avro"))) { _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen())); FSDataInputStream is = fs.open(stat.getPath()); DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(); DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader); try {//from www .ja v a 2 s .c o m while (dataFileStream.hasNext()) { GenericRecord r = dataFileStream.next(); Long memberId = (Long) ((GenericRecord) r.get("key")).get("id"); Integer impressions = (Integer) ((GenericRecord) r.get("value")).get("impressions"); Integer clicks = (Integer) ((GenericRecord) r.get("value")).get("clicks"); Assert.assertFalse(counts.containsKey(memberId)); ImpressionClick data = new ImpressionClick(); data.clicks = clicks; data.impressions = impressions; counts.put(memberId, data); } } finally { dataFileStream.close(); } } return counts; }
From source file:datafu.hourglass.test.TestAvroJob.java
License:Apache License
/**
 * Loads the per-id counts from every {@code *.avro} file under {@code _outputPath/timestamp}.
 *
 * <p>Each record contributes one entry, read from the record's top-level {@code id} and
 * {@code count} fields. An id appearing more than once fails an assertion.
 *
 * @param timestamp sub-directory of {@code _outputPath} that must exist and holds the Avro files
 * @return map from id to count; empty when no file matches or the files hold no records
 * @throws IOException if the file system or an Avro container cannot be read
 */
private HashMap<Long, Long> loadOutputCounts(String timestamp) throws IOException {
    HashMap<Long, Long> counts = new HashMap<Long, Long>();
    FileSystem fs = getFileSystem();
    Assert.assertTrue(fs.exists(new Path(_outputPath, timestamp)));
    for (FileStatus stat : fs.globStatus(new Path(_outputPath, timestamp + "/*.avro"))) {
        _log.info(String.format("found: %s (%d bytes)", stat.getPath(), stat.getLen()));
        FSDataInputStream is = fs.open(stat.getPath());
        try {
            DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
            // Constructing the DataFileStream reads the container header and can throw;
            // the outer finally makes sure the raw stream is released in that case too.
            DataFileStream<GenericRecord> dataFileStream = new DataFileStream<GenericRecord>(is, reader);
            try {
                while (dataFileStream.hasNext()) {
                    GenericRecord r = dataFileStream.next();
                    Long memberId = (Long) r.get("id");
                    Long count = (Long) r.get("count");
                    Assert.assertFalse(counts.containsKey(memberId));
                    counts.put(memberId, count);
                }
            } finally {
                dataFileStream.close();
            }
        } finally {
            // Closing an already-closed stream has no effect per the Closeable contract.
            is.close();
        }
    }
    return counts;
}