List of usage examples for org.apache.hadoop.fs.FileUtil.stat2Paths
public static Path[] stat2Paths(FileStatus[] stats)
From source file: tv.icntv.log.stb.commons.HadoopUtils.java
License: Apache License

public static boolean isLzo(Path path) throws IOException {
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        Path[] paths = FileUtil.stat2Paths(fileSystem.listStatus(path));
        for (Path p : paths) {
            if (!p.getName().contains(".lzo")) {
                return false;
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
    return true;
}
From source file: uk.bl.wa.hadoop.datasets.WARCDatasetGeneratorIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the WARCDatasetGenerator
    WARCDatasetGenerator wir = new WARCDatasetGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Copy the output out of HDFS and onto local FS:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/datasets-" + output.getName());
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
        fout.close();
    }
    // Did we generate the expected multiple output files?:
    Assert.assertEquals(4, outputFiles.length);
}
From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGeneratorIntegrationTest.java
License: Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the MDXSeqSampleGenerator
    MDXSeqSampleGenerator wir = new MDXSeqSampleGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }
    // Check contents of the output:
    // TBA
}
From source file: uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGeneratorIntegrationTest.java
License: Open Source License

@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the MDXSeqStatsGenerator
    MDXSeqStatsGenerator wir = new MDXSeqStatsGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }
    // Check contents of the output:
    // TBA
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };
    // Set up the WARCMDXGenerator
    WARCMDXGenerator wir = new WARCMDXGenerator();
    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Default is 1 reducer (as knitting together multiple sequence files
    // is not a mere matter of concatenation):
    Assert.assertEquals(1, outputFiles.length);
    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputSeq);
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();
    // Check contents of the output:
    Configuration config = new Configuration();
    Path path = new Path(outputSeq.getAbsolutePath());
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
    Writable value = (Writable) reader.getValueClass().newInstance();
    MDX mdx;
    int counter = 0;
    while (reader.next(key, value)) {
        mdx = new MDX(value.toString());
        System.out.println(
                "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
        counter++;
    }
    assertEquals(114, counter);
    reader.close();
    // Now test the MDXSeqMerger
    testSeqMerger(outputFiles);
}
From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License: Open Source License

private void testSeqMerger(Path[] inputFiles) throws Exception {
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);
    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.outputMerged.getName(), "-r", "1" };
    // Set up the MDXSeqMerger
    MDXSeqMerger msm = new MDXSeqMerger();
    // run job
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    msm.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");
    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputMergedSeq);
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outputMerged, new OutputLogFilter()));
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();
}
From source file: uk.bl.wa.hadoop.indexer.WARCIndexerRunnerIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testFullIndexerJob() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-inputs.txt", "-o", this.output.getName() };
    // Set up the WARCIndexerRunner
    WARCIndexerRunner wir = new WARCIndexerRunner();
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    conf.set("mapred.child.java.opts", "-Xmx1024m");
    wir.createJobConf(conf, args);
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}
From source file: uk.bl.wa.hadoop.mapreduce.hash.HdsfFileHasherIntegrationTest.java
License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testShaSumHasher() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "-i", "src/test/resources/test-input-dir.txt", "-o", this.output.getName() };
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    log.info("Running job...");
    ToolRunner.run(conf, new HdfsFileHasher(), args);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);
    // Check contents of the output:
    int line_count = 0;
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                line_count++;
                // Check:
                if (line_count == 1) {
                    assertEquals(
                            "/user/andy/inputs\t722eb9d7bfeb0b2ad2dd9c8a2fd7105f2880b139e5248e9b13a41d69ec63893b9afc034751be1432d867e171f4c6293ac89fc4e85c09a72288c16fd40f5996b2 26164 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.warc.gz",
                            line);
                } else if (line_count == 2) {
                    assertEquals(
                            "/user/andy/inputs\tba14747ac52ff1885905022299b4c470ad87270128939001b674c13e8787612011b4f2bd4f3c568df3b6789b7aa50ba0062c58a506debc12c57c037d10012203 18406 /user/andy/inputs/IAH-20080430204825-00000-blackbook-truncated.arc.gz",
                            line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
}
From source file: uk.bl.wa.hadoop.mapreduce.MapReduceTestBaseClass.java
License: Open Source License

/**
 * A simple test to check the setup worked:
 *
 * @throws IOException
 */
@Test
public void testSetupWorked() throws IOException {
    log.info("Checking input file(s) is/are present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(testWarcs.length, inputFiles.length);
}
From source file: uk.bl.wa.hadoop.mapreduce.warcstats.WARCStatsToolIntegrationTest.java
License: Open Source License

@Test
public void testFullWARCStatsJob() throws Exception {
    // prepare for test
    // createTextInputFile();
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(input, new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Set up arguments for the job:
    // FIXME The input file could be written by this test.
    String[] args = { "src/test/resources/test-inputs.txt", this.output.getName() };
    // Set up the config and tool
    Config config = ConfigFactory.load();
    WARCStatsTool wir = new WARCStatsTool();
    // run job
    log.info("Setting up job config...");
    JobConf conf = this.mrCluster.createJobConf();
    wir.createJobConf(conf, args);
    // Disable speculative execution for tests:
    conf.set("mapred.reduce.tasks.speculative.execution", "false");
    log.info("Running job...");
    JobClient.runJob(conf);
    log.info("Job finished, checking the results...");
    // check the output
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    Assert.assertEquals(config.getInt("warc.hadoop.num_reducers"), outputFiles.length);
    // Check contents of the output:
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            BufferedReader reader = new BufferedReader(new InputStreamReader(is));
            String line = null;
            while ((line = reader.readLine()) != null) {
                log.info(line);
                if (line.startsWith("RECORD-TOTAL")) {
                    assertEquals("RECORD-TOTAL\t32", line);
                }
            }
            reader.close();
        } else {
            log.info(" --- ...skipping directory...");
        }
    }
    // Assert.assertEquals("a\t2", reader.readLine());
    // Assert.assertEquals("b\t1", reader.readLine());
    // Assert.assertNull(reader.readLine());
}