List of usage examples for org.apache.hadoop.fs FileSystem listStatus
public FileStatus[] listStatus(Path f, PathFilter filter) throws FileNotFoundException, IOException
public FileStatus[] listStatus(Path[] files, PathFilter filter) throws FileNotFoundException, IOException

Filters the files/directories in the given path (or list of paths) using the user-supplied PathFilter and returns the matching FileStatus entries. Note that the examples below all call the single-Path overload.
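Before the project examples, here is a minimal self-contained sketch of the pattern they all share: list a directory through a PathFilter and iterate the returned FileStatus array. The directory /tmp/output and the part-r- prefix are illustrative assumptions, not taken from any example below.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListStatusSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/output"); // hypothetical directory; adjust for your cluster
        FileSystem fs = dir.getFileSystem(conf);
        // The filter is applied before listStatus returns, so only
        // reducer output files (part-r-*) appear in the result array.
        PathFilter filter = new PathFilter() {
            @Override
            public boolean accept(Path path) {
                return path.getName().startsWith("part-r-");
            }
        };
        for (FileStatus status : fs.listStatus(dir, filter)) {
            System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
        }
        fs.close();
    }
}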
From source file:GetRetweetersAndCountPerUser.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: GetRetweetersAndCountPerUser <in> <out> <num_reducers>");
        System.exit(2);
    }
    Job job = new Job(conf, "get retweeters and count per user");
    job.setJarByClass(RetweetersPerUser.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    System.out.println(otherArgs[0]);
    job.setMapperClass(TweetMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(Integer.parseInt(otherArgs[2]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    if (job.waitForCompletion(true)) {
        FileSystem hdfs = FileSystem.get(new URI(otherArgs[1]), conf);
        Path dir = new Path(otherArgs[1]);
        // Only read the reducer output files (part-r-*).
        PathFilter filter = new PathFilter() {
            public boolean accept(Path file) {
                return file.getName().startsWith("part-r-");
            }
        };
        HashMap<Integer, Integer> counts_for_user = new HashMap<Integer, Integer>();
        FileStatus[] files = hdfs.listStatus(dir, filter);
        Arrays.sort(files);
        for (int i = 0; i != files.length; i++) {
            Path pt = files[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] columns = line.split("\t");
                int key = Integer.parseInt(columns[0]);
                if (counts_for_user.containsKey(key))
                    counts_for_user.put(key, counts_for_user.get(key) + 1);
                else
                    counts_for_user.put(key, 1);
            }
            br.close();
        }
        FSDataOutputStream fsDataOutputStream = hdfs.create(new Path(otherArgs[1] + "_count"));
        PrintWriter writer = new PrintWriter(fsDataOutputStream);
        for (Entry<Integer, Integer> e : counts_for_user.entrySet()) {
            writer.write(e.getKey() + "\t" + e.getValue() + "\n");
        }
        writer.close();
        fsDataOutputStream.close();
        hdfs.close();
        System.exit(0);
    }
    System.exit(1);
}
From source file:HadoopUtilsTest.java
License:Apache License
public static void main(String[] args) throws IOException {
    Configuration configuration = HBaseConfiguration.create();
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        // Match only reducer output files such as part-r-00000.
        FileStatus[] fileStatuses = fileSystem.listStatus(new Path("/icntv/grade/correlate-result/2013-12-12"),
                new PathFilter() {
                    @Override
                    public boolean accept(Path path) {
                        return path.getName().matches("part-r-\\d*");
                    }
                });
        for (FileStatus f : fileStatuses) {
            IOUtils.copyBytes(fileSystem.open(f.getPath()), System.out, 4096, false);
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
}
From source file:DisplayClustering.java
License:Apache License
protected static void loadClustersWritable(Path output) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(output.toUri(), conf);
    for (FileStatus s : fs.listStatus(output, new ClustersFilter())) {
        List<Cluster> clusters = readClustersWritable(s.getPath());
        CLUSTERS.add(clusters);
    }
}
From source file:be.uantwerpen.adrem.bigfim.AprioriPhaseReducer.java
License:Apache License
private int getLargestIndex(Configuration conf, Path path, String prefix, int index) {
    int largestIx = -1;
    try {
        FileSystem fs = path.getFileSystem(conf);
        for (FileStatus file : fs.listStatus(path, new NameStartsWithFilter(prefix))) {
            largestIx = max(largestIx, parseInt(file.getPath().getName().split("-")[index]));
        }
    } catch (NumberFormatException e) {
        // Ignore file names without a numeric component at the given index.
    } catch (IOException e) {
        // Ignore listing failures; -1 signals that no index was found.
    }
    return largestIx;
}
From source file:cascading.avro.AvroScheme.java
License:Apache License
/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 */
private Schema getSourceSchema(FlowProcess<JobConf> flowProcess, Tap tap) throws IOException {
    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
    // Now get all the things that are one level down
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir()) {
            for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
        }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) { // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}
From source file:cascading.scheme.DeprecatedAvroScheme.java
License:Apache License
/**
 * This method peeks at the source data to get a schema when none has been provided.
 *
 * @param flowProcess The cascading FlowProcess object for this flow.
 * @param tap         The cascading Tap object.
 * @return Schema The schema of the peeked at data, or Schema.NULL if none exists.
 */
private Schema getSourceSchema(FlowProcess<? extends Configuration> flowProcess, Tap tap) throws IOException {
    if (tap instanceof CompositeTap) {
        tap = (Tap) ((CompositeTap) tap).getChildTaps().next();
    }
    final String path = tap.getIdentifier();
    Path p = new Path(path);
    final FileSystem fs = p.getFileSystem(flowProcess.getConfigCopy());
    // Get all the input dirs
    List<FileStatus> statuses = new LinkedList<FileStatus>(Arrays.asList(fs.globStatus(p, filter)));
    // Now get all the things that are one level down
    for (FileStatus status : new LinkedList<FileStatus>(statuses)) {
        if (status.isDir()) {
            for (FileStatus child : fs.listStatus(status.getPath(), filter)) {
                if (child.isDir()) {
                    statuses.addAll(Arrays.asList(fs.listStatus(child.getPath(), filter)));
                } else if (fs.isFile(child.getPath())) {
                    statuses.add(child);
                }
            }
        }
    }
    for (FileStatus status : statuses) {
        Path statusPath = status.getPath();
        if (fs.isFile(statusPath)) { // no need to open them all
            InputStream stream = null;
            DataFileStream reader = null;
            try {
                stream = new BufferedInputStream(fs.open(statusPath));
                reader = new DataFileStream(stream, new GenericDatumReader());
                return reader.getSchema();
            } finally {
                if (reader == null) {
                    if (stream != null) {
                        stream.close();
                    }
                } else {
                    reader.close();
                }
            }
        }
    }
    // couldn't find any Avro files, return null schema
    return Schema.create(Schema.Type.NULL);
}
From source file:co.cask.cdap.internal.app.runtime.batch.dataset.partitioned.DynamicPartitioningOutputCommitter.java
License:Apache License
/**
 * Get a list of all paths where output from committed tasks are stored.
 *
 * @param context the context of the current job
 * @return the list of these Paths/FileStatuses.
 * @throws IOException
 */
private FileStatus[] getAllCommittedTaskPaths(JobContext context) throws IOException {
    Path jobAttemptPath = getJobAttemptPath(context);
    FileSystem fs = jobAttemptPath.getFileSystem(context.getConfiguration());
    return fs.listStatus(jobAttemptPath, new CommittedTaskFilter());
}
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
@Test
public void testDedupByValueWithDelimitedTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
    String[] args = new String[] { "-inputFormat", "co.nubetech.hiho.dedup.DelimitedTextInputFormat",
            "-inputKeyClassName", "org.apache.hadoop.io.Text", "-inputValueClassName",
            "org.apache.hadoop.io.Text", "-inputPath", "/input1,/input2", "-outputPath", "output",
            "-delimeter", ",", "-column", "1", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
@Test
public void testDedupByValueWithTextInputFormat() throws Exception {
    final String inputData1 = "Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson";
    final String inputData2 = "Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos\n"
            + "Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein\n"
            + "Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson";
    createTextFileInHDFS(inputData1, "/input1", "testFile1.txt");
    createTextFileInHDFS(inputData2, "/input2", "testFile2.txt");
    String[] args = new String[] { "-inputFormat", "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
            "-inputPath", "/input1,/input2", "-outputPath", "output", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}
From source file:co.nubetech.hiho.dedup.TestDedupJob.java
License:Apache License
@Test
public void testDedupByValueWithSequenceFileAsTextInputFormat() throws Exception {
    HashMap<Text, Text> inputData1 = new HashMap<Text, Text>();
    inputData1.put(new Text("1"),
            new Text("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney"));
    inputData1.put(new Text("2"),
            new Text("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson"));
    inputData1.put(new Text("3"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData1, "/input1", "testFile1.seq");
    HashMap<Text, Text> inputData2 = new HashMap<Text, Text>();
    inputData2.put(new Text("1"),
            new Text("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos"));
    inputData2.put(new Text("2"),
            new Text("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson"));
    inputData2.put(new Text("4"),
            new Text("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein"));
    createSequenceFileInHdfs(inputData2, "/input2", "testFile2.seq");
    String[] args = new String[] { "-inputFormat",
            "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsTextInputFormat", "-outputFormat",
            "co.nubetech.hiho.mapreduce.lib.output.NoKeyOnlyValueOutputFormat", "-inputPath",
            "/input1,/input2", "-outputPath", "output", "-inputKeyClassName", "org.apache.hadoop.io.Text",
            "-inputValueClassName", "org.apache.hadoop.io.Text", "-dedupBy", "value" };
    DedupJob job = runDedupJob(args);
    assertEquals(6, job.getTotalRecordsRead());
    assertEquals(0, job.getBadRecords());
    assertEquals(5, job.getOutput());
    assertEquals(1, job.getDuplicateRecords());
    FileSystem outputFS = getFileSystem();
    Path outputPath = new Path(outputFS.getHomeDirectory(), "output");
    FileStatus[] status = outputFS.listStatus(outputPath, getOutputPathFilter());
    assertTrue(outputFS.exists(outputPath));
    List<String> expectedOutput = new ArrayList<String>();
    expectedOutput.add("Xavier Wilson,Mason Holloway,Carlos Johnston,Martin Noel,Drake Mckinney");
    expectedOutput.add("Kennedy Bailey,Jerome Perry,David Cabrera,Edan Fleming,Orlando Tyson");
    expectedOutput.add("Drake Mckinney,Murphy Baird,Theodore Lindsey,Nehru Wilcox,Harper Klein");
    expectedOutput.add("Zephania Bauer,Jermaine Gordon,Vincent Moon,Steven Pierce,Jasper Campos");
    expectedOutput.add("Kennedy Bailey,Plato Atkinson,Stuart Guy,Rooney Levy,Judah Benson");
    int count = 0;
    for (FileStatus fileStat : status) {
        logger.debug("File status is " + fileStat.getPath() + " and is it a dir? " + fileStat.isDirectory());
        FSDataInputStream in = outputFS.open(fileStat.getPath());
        String line = null;
        while ((line = in.readLine()) != null) {
            logger.debug("Output is " + line);
            assertTrue("Matched output " + line, expectedOutput.contains(line));
            expectedOutput.remove(line);
            count++;
        }
        in.close();
    }
    assertEquals(5, count);
}