Example usage for org.apache.hadoop.fs FileUtil stat2Paths


Introduction

This page collects example usages of org.apache.hadoop.fs.FileUtil.stat2Paths.

Prototype

public static Path[] stat2Paths(FileStatus[] stats) 


Document

Converts an array of FileStatus objects to an array of Path objects.
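
As a minimal sketch, the method is typically fed the result of FileSystem.listStatus(); the directory path below is an assumed placeholder, not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        try {
            // listStatus() returns FileStatus[]; stat2Paths() keeps only the Path of each entry.
            // "/tmp/example" is an illustrative directory, assumed to exist.
            FileStatus[] stats = fs.listStatus(new Path("/tmp/example"));
            Path[] paths = FileUtil.stat2Paths(stats);
            for (Path p : paths) {
                System.out.println(p);
            }
        } finally {
            fs.close();
        }
    }
}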

Usage

From source file:tv.icntv.log.stb.commons.HadoopUtils.java

License:Apache License

public static boolean isLzo(Path path) throws IOException {
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        Path[] paths = FileUtil.stat2Paths(fileSystem.listStatus(path));
        for (Path p : paths) {
            if (!p.getName().contains(".lzo")) {
                return false;
            }
        }

    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
    return true;
}

From source file:uk.bl.wa.hadoop.datasets.WARCDatasetGeneratorIntegrationTest.java

License:Open Source License

@SuppressWarnings("deprecation")
@Test
public void testGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    WARCDatasetGenerator wir = new WARCDatasetGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));

    // Copy the output out of HDFS and onto local FS:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/datasets-" + output.getName());
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
        fout.close();
    }

    // Did we generate the expected multiple output files?:
    Assert.assertEquals(4, outputFiles.length);

}

From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGeneratorIntegrationTest.java

License:Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);

    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    MDXSeqSampleGenerator wir = new MDXSeqSampleGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);

    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }

    // Check contents of the output:
    // TBA
}

From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGeneratorIntegrationTest.java

License:Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);

    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    MDXSeqStatsGenerator wir = new MDXSeqStatsGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);

    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }

    // Check contents of the output:
    // TBA
}

From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java

License:Open Source License

@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    WARCMDXGenerator wir = new WARCMDXGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Default is 1 reducer (as knitting together multiple sequence files
    // is not a mere matter of concatenation):
    Assert.assertEquals(1, outputFiles.length);

    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputSeq);
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();

    // Check contents of the output:
    Configuration config = new Configuration();
    Path path = new Path(outputSeq.getAbsolutePath());
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
    Writable value = (Writable) reader.getValueClass().newInstance();

    MDX mdx;
    int counter = 0;
    while (reader.next(key, value)) {
        mdx = new MDX(value.toString());
        System.out.println(
                "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
        counter++;
    }
    assertEquals(114, counter);
    reader.close();

    // Now test the MDXSeqMerger
    testSeqMerger(outputFiles);
}

From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java

License:Open Source License

private void testSeqMerger(Path[] inputFiles) throws Exception {

    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.outputMerged.getName(), "-r", "1" };

    // Set up the WARCIndexerRunner
    MDXSeqMerger msm = new MDXSeqMerger();

    // run job
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    msm.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputMergedSeq);
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outputMerged, new OutputLogFilter()));
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();

}

From source file:uk.bl.wa.hadoop.indexer.WARCIndexerRunnerIntegrationTest.java

License:Open Source License

@SuppressWarnings("deprecation")
@Test
public void testFullIndexerJob() throws Exception {
    // prepare for test
    //createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);

    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-inputs.txt", "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    WARCIndexerRunner wir = new WARCIndexerRunner();

    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    wir.createJobConf(conf, args);
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");

    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    //Assert.assertEquals(1, outputFiles.length);

    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}

From source file:uk.bl.wa.hadoop.mapreduce.hash.HdsfFileHasherIntegrationTest.java

License:Open Source License

@SuppressWarnings("deprecation")
@Test
public void testShaSumHasher() throws Exception {
    // prepare for test
    //createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);

    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-input-dir.txt", "-o", this.output.getName() };

    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    log.info("Running job...");
    ToolRunner.run(conf, new HdfsFileHasher(), args);
    log.info("Job finished, checking the results...");

    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    //Assert.assertEquals(1, outputFiles.length);

    // Check contents of the output:
    int line_count = 0;
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                line_count++;
                // Check:
                if (line_count == 1) {
                    assertEquals(
                            "/user/andy/inputs\t722eb9d7bfeb0b2ad2dd9c8a2fd7105f2880b139e5248e9b13a41d69ec63893b9afc034751be1432d867e171f4c6293ac89fc4e85c09a72288c16fd40f5996b2 26164 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.warc.gz",
                            line);
                } else if (line_count == 2) {
                    assertEquals(
                            "/user/andy/inputs\tba14747ac52ff1885905022299b4c470ad87270128939001b674c13e8787612011b4f2bd4f3c568df3b6789b7aa50ba0062c58a506debc12c57c037d10012203 18406 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.arc.gz",
                            line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}

From source file:uk.bl.wa.hadoop.mapreduce.MapReduceTestBaseClass.java

License:Open Source License

/**
 * A simple test to check the setup worked:
 * @throws IOException
 */
@Test
public void testSetupWorked() throws IOException {
    log.info("Checking input file(s) is/are present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(testWarcs.length, inputFiles.length);
}

From source file:uk.bl.wa.hadoop.mapreduce.warcstats.WARCStatsToolIntegrationTest.java

License:Open Source License

@Test
public void testFullWARCStatsJob() throws Exception {
    // prepare for test
    // createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);

    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "src/test/resources/test-inputs.txt", this.output.getName() };

    // Set up the config and tool
    Config config = ConfigFactory.load();
    WARCStatsTool wir = new WARCStatsTool();

    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    wir.createJobConf(conf, args);
    // Disable speculative execution for tests:
    conf.set("mapred.reduce.tasks.speculative.execution", "false");
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");

    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    Assert.assertEquals(config.getInt("warc.hadoop.num_reducers"), outputFiles.length);

    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                if (line.startsWith("RECORD-TOTAL")) {
                    assertEquals("RECORD-TOTAL\t32", line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
    // Assert.assertEquals("a\t2", reader.readLine());
    // Assert.assertEquals("b\t1", reader.readLine());
    // Assert.assertNull(reader.readLine());
}