List of usage examples for org.apache.hadoop.fs.FileUtil.stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
From source file: tv.icntv.log.stb.commons.HadoopUtils.java
License: Apache License

public static boolean isLzo(Path path) throws IOException {
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        Path[] paths = FileUtil.stat2Paths(fileSystem.listStatus(path));
        for (Path p : paths) {
            if (!p.getName().contains(".lzo")) {
                return false;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
    return true;
}
From source file: uk.bl.wa.hadoop.datasets.WARCDatasetGeneratorIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the WARCDatasetGenerator
    WARCDatasetGenerator wir = new WARCDatasetGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Copy the output out of HDFS and onto local FS:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/datasets-" + output.getName());
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
        fout.close();
    }
    // Did we generate the expected multiple output files?:
    Assert.assertEquals(4, outputFiles.length);
}
From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGeneratorIntegrationTest.java
License: Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the MDXSeqSampleGenerator
    MDXSeqSampleGenerator wir = new MDXSeqSampleGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }
    // Check contents of the output:
    // TBA
}
From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGeneratorIntegrationTest.java
License: Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the MDXSeqStatsGenerator
    MDXSeqStatsGenerator wir = new MDXSeqStatsGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }
    // Check contents of the output:
    // TBA
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the WARCMDXGenerator
    WARCMDXGenerator wir = new WARCMDXGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Default is 1 reducer (as knitting together multiple sequence files
    // is not a mere matter of concatenation):
    Assert.assertEquals(1, outputFiles.length);
    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputSeq);
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();
    // Check contents of the output:
    Configuration config = new Configuration();
    Path path = new Path(outputSeq.getAbsolutePath());
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
    Writable value = (Writable) reader.getValueClass().newInstance();
    MDX mdx;
    int counter = 0;
    while (reader.next(key, value)) {
        mdx = new MDX(value.toString());
        System.out.println(
                "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
        counter++;
    }
    assertEquals(114, counter);
    reader.close();
    // Now test the MDXSeqMerger
    testSeqMerger(outputFiles);
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License: Open Source License

private void testSeqMerger(Path[] inputFiles) throws Exception {
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.outputMerged.getName(), "-r", "1" };
    // Set up the MDXSeqMerger
    MDXSeqMerger msm = new MDXSeqMerger();
    // run job
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    msm.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputMergedSeq);
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outputMerged, new OutputLogFilter()));
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();
}
From source file: uk.bl.wa.hadoop.indexer.WARCIndexerRunnerIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testFullIndexerJob() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-inputs.txt", "-o", this.output.getName() };
    // Set up the WARCIndexerRunner
    WARCIndexerRunner wir = new WARCIndexerRunner();
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    wir.createJobConf(conf, args);
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}
From source file: uk.bl.wa.hadoop.mapreduce.hash.HdsfFileHasherIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testShaSumHasher() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-input-dir.txt", "-o", this.output.getName() };
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    log.info("Running job...");
    ToolRunner.run(conf, new HdfsFileHasher(), args);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Check contents of the output:
    int line_count = 0;
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                line_count++;
                // Check:
                if (line_count == 1) {
                    assertEquals(
                            "/user/andy/inputs\t722eb9d7bfeb0b2ad2dd9c8a2fd7105f2880b139e5248e9b13a41d69ec63893b9afc034751be1432d867e171f4c6293ac89fc4e85c09a72288c16fd40f5996b2 26164 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.warc.gz",
                            line);
                } else if (line_count == 2) {
                    assertEquals(
                            "/user/andy/inputs\tba14747ac52ff1885905022299b4c470ad87270128939001b674c13e8787612011b4f2bd4f3c568df3b6789b7aa50ba0062c58a506debc12c57c037d10012203 18406 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.arc.gz",
                            line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}
From source file: uk.bl.wa.hadoop.mapreduce.MapReduceTestBaseClass.java
License: Open Source License

/**
 * A simple test to check the setup worked:
 *
 * @throws IOException
 */
@Test
public void testSetupWorked() throws IOException {
    log.info("Checking input file(s) is/are present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(testWarcs.length, inputFiles.length);
}
From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCStatsToolIntegrationTest.java
License: Open Source License

@Test
public void testFullWARCStatsJob() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "src/test/resources/test-inputs.txt", this.output.getName() };
    // Set up the config and tool
    Config config = ConfigFactory.load();
    WARCStatsTool wir = new WARCStatsTool();
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    wir.createJobConf(conf, args);
    // Disable speculative execution for tests:
    conf.set("mapred.reduce.tasks.speculative.execution", "false");
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    Assert.assertEquals(config.getInt("warc.hadoop.num_reducers"), outputFiles.length);
    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                if (line.startsWith("RECORD-TOTAL")) {
                    assertEquals("RECORD-TOTAL\t32", line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
    // Assert.assertEquals("a\t2", reader.readLine());
    // Assert.assertEquals("b\t1", reader.readLine());
    // Assert.assertNull(reader.readLine());
}