Usage examples for org.apache.hadoop.mapred.OutputLogFilter
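OutputLogFilter is a PathFilter that accepts every path except the _logs side directory that classic MapReduce wrote into a job's output directory, so passing it to FileSystem.listStatus() yields only the actual output (part) files. In later Hadoop releases the class was deprecated in favour of Utils.OutputFileUtils.OutputLogFilter. Before the collected examples, a minimal sketch of the pattern they all share; taking the output directory from args[0] is an assumption for illustration, not part of any example below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.OutputLogFilter;

public class ListJobOutput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // Hypothetical: the job output directory is passed as the first argument.
        Path outputDir = new Path(args[0]);
        // OutputLogFilter rejects the "_logs" directory, keeping only part files.
        Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outputDir, new OutputLogFilter()));
        for (Path p : outputFiles) {
            System.out.println(p);
        }
    }
}

Every example below uses this same listStatus + FileUtil.stat2Paths idiom to enumerate job output before reading it.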
From source file:crunch.MaxTemperature.java
License:Apache License
public void test() throws Exception {
    Configuration conf = createJobConf();
    Path localInput = new Path("input/ncdc/micro");
    Path input = getInputDir();
    Path output = getOutputDir();

    // Copy input data into test HDFS
    getFileSystem().copyFromLocalFile(localInput, input);

    MaxTemperatureDriver driver = new MaxTemperatureDriver();
    driver.setConf(conf);
    int exitCode = driver.run(new String[] { input.toString(), output.toString() });
    assertThat(exitCode, is(0));

    // Check the output is as expected
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    assertThat(outputFiles.length, is(1));

    InputStream in = getFileSystem().open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(in));
    assertThat(reader.readLine(), is("1949\t111"));
    assertThat(reader.readLine(), is("1950\t22"));
    assertThat(reader.readLine(), nullValue());
    reader.close();
}
From source file:crunch.MaxTemperature.java
License:Apache License
private void checkOutput(Configuration conf, Path output) throws IOException {
    FileSystem fs = FileSystem.getLocal(conf);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(output, new OutputLogFilter()));
    assertThat(outputFiles.length, is(1));

    BufferedReader actual = asBufferedReader(fs.open(outputFiles[0]));
    BufferedReader expected = asBufferedReader(getClass().getResourceAsStream("/expected.txt"));
    String expectedLine;
    while ((expectedLine = expected.readLine()) != null) {
        assertThat(actual.readLine(), is(expectedLine));
    }
    assertThat(actual.readLine(), nullValue());
    actual.close();
    expected.close();
}
From source file:io.aos.t4f.hadoop.mapred.WordCountTest.java
License:Apache License
@Test
public void testCount() throws Exception {
    createTextInputFile();
    JobClient.runJob(createJobConf());

    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    Assert.assertEquals(1, outputFiles.length);

    InputStream is = getFileSystem().open(outputFiles[0]);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is));
    Assert.assertEquals("a\t2", reader.readLine());
    Assert.assertEquals("b\t1", reader.readLine());
    Assert.assertNull(reader.readLine());
    reader.close();
}
From source file:org.apache.mahout.clustering.cdbw.CDbwMapper.java
License:Apache License
public static Map<Integer, List<VectorWritable>> getRepresentativePoints(Configuration conf) {
    String statePath = conf.get(CDbwDriver.STATE_IN_KEY);
    Map<Integer, List<VectorWritable>> representativePoints = new HashMap<Integer, List<VectorWritable>>();
    try {
        Path path = new Path(statePath);
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        FileStatus[] status = fs.listStatus(path, new OutputLogFilter());
        for (FileStatus s : status) {
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, s.getPath(), conf);
            try {
                IntWritable key = new IntWritable(0);
                VectorWritable point = new VectorWritable();
                while (reader.next(key, point)) {
                    List<VectorWritable> repPoints = representativePoints.get(key.get());
                    if (repPoints == null) {
                        repPoints = new ArrayList<VectorWritable>();
                        representativePoints.put(key.get(), repPoints);
                    }
                    repPoints.add(point);
                    point = new VectorWritable();
                }
            } finally {
                reader.close();
            }
        }
        return representativePoints;
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}
From source file:org.unigram.likelike.lsh.TestLSHRecommendations.java
License:Apache License
private boolean dfsCheck(Configuration conf, Path outputPath) throws IOException {
    FileSystem fs = FileSystem.getLocal(conf);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outputPath, new OutputLogFilter()));
    //if (outputFiles != null) {
    //    TestCase.assertEquals(outputFiles.length, 1);
    //} else {
    //    TestCase.fail();
    //}

    BufferedReader reader = this.asBufferedReader(fs.open(outputFiles[0]));
    String line;
    MultiHashMap resultMap = new MultiHashMap();
    while ((line = reader.readLine()) != null) {
        String[] lineArray = line.split("\t");
        resultMap.put(Long.parseLong(lineArray[0]), // target
                Long.parseLong(lineArray[1])); // recommended
    }
    this.check(resultMap);
    return true;
}
From source file:uk.bl.wa.hadoop.datasets.WARCDatasetGeneratorIntegrationTest.java
License:Open Source License
@SuppressWarnings("deprecation") @Test//from www.ja v a2 s . c o m public void testGenerator() throws Exception { // prepare for test // createTextInputFile(); log.info("Checking input file is present..."); // Check that the input file is present: Path[] inputFiles = FileUtil.stat2Paths( getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter())); Assert.assertEquals(2, inputFiles.length); // Create a file of the inputs File tmpInputsFile = writeInputFile(inputFiles); // Set up arguments for the job: String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() }; // Set up the WARCIndexerRunner WARCDatasetGenerator wir = new WARCDatasetGenerator(); // run job // Job configuration: log.info("Setting up job config..."); JobConf jobConf = this.mrCluster.createJobConf(); jobConf.set("mapred.child.java.opts", "-Xmx512m"); wir.createJobConf(jobConf, args); log.info("Running job..."); JobClient.runJob(jobConf); log.info("Job finished, checking the results..."); // check the output exists Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter())); // Copy the output out of HDFS and onto local FS: for (Path output : outputFiles) { FileOutputStream fout = new FileOutputStream("target/datasets-" + output.getName()); log.info(" --- output : " + output); if (getFileSystem().isFile(output)) { InputStream is = getFileSystem().open(output); IOUtils.copy(is, fout); } else { log.info(" --- ...skipping directory..."); } fout.flush(); fout.close(); } // Did we generate the expected multiple output files?: Assert.assertEquals(4, outputFiles.length); }
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqSampleGeneratorIntegrationTest.java
License:Open Source License
@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);

    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    MDXSeqSampleGenerator wir = new MDXSeqSampleGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);

    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }

    // Check contents of the output:
    // TBA
}
From source file:uk.bl.wa.hadoop.indexer.mdx.MDXSeqStatsGeneratorIntegrationTest.java
License:Open Source License
@Test
public void testSeqStats() throws Exception {
    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            dfsCluster.getFileSystem().listStatus(new Path(input, "mdx-seq/"), new OutputLogFilter()));
    Assert.assertEquals(1, inputFiles.length);

    // Create a file of the inputs
    File tmpInputsFile = WARCMDXGeneratorIntegrationTest.writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    MDXSeqStatsGenerator wir = new MDXSeqStatsGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil
            .stat2Paths(dfsCluster.getFileSystem().listStatus(output, new OutputLogFilter()));
    // Assert.assertEquals(1, outputFiles.length);

    // Copy the output out:
    for (Path output : outputFiles) {
        FileOutputStream fout = new FileOutputStream("target/" + output.getName());
        log.info(" --- output : " + output);
        if (dfsCluster.getFileSystem().isFile(output)) {
            InputStream is = dfsCluster.getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.close();
    }

    // Check contents of the output:
    // TBA
}
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License:Open Source License
@SuppressWarnings("deprecation") @Test//from w w w . j a v a 2s. co m public void testMDXGenerator() throws Exception { // prepare for test // createTextInputFile(); log.info("Checking input file is present..."); // Check that the input file is present: Path[] inputFiles = FileUtil.stat2Paths( getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter())); Assert.assertEquals(2, inputFiles.length); // Create a file of the inputs File tmpInputsFile = writeInputFile(inputFiles); // Set up arguments for the job: String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() }; // Set up the WARCIndexerRunner WARCMDXGenerator wir = new WARCMDXGenerator(); // run job // Job configuration: log.info("Setting up job config..."); JobConf jobConf = this.mrCluster.createJobConf(); jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1); jobConf.set("mapred.child.java.opts", "-Xmx512m"); wir.createJobConf(jobConf, args); log.info("Running job..."); JobClient.runJob(jobConf); log.info("Job finished, checking the results..."); // check the output exists Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter())); // Default is 1 reducers (as knitting together multiple sequence files // is not a mere matter of concatentation): Assert.assertEquals(1, outputFiles.length); // Copy the output out of HDFS and onto local FS: FileOutputStream fout = new FileOutputStream(outputSeq); for (Path output : outputFiles) { log.info(" --- output : " + output); if (getFileSystem().isFile(output)) { InputStream is = getFileSystem().open(output); IOUtils.copy(is, fout); } else { log.info(" --- ...skipping directory..."); } fout.flush(); } fout.close(); // Check contents of the output: Configuration config = new Configuration(); Path path = new Path(outputSeq.getAbsolutePath()); SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config); WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance(); Writable value = (Writable) reader.getValueClass().newInstance(); MDX mdx; int counter = 0; while (reader.next(key, value)) { mdx = new MDX(value.toString()); System.out.println( "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT()); counter++; } assertEquals(114, counter); reader.close(); // Now test the MDXSeqMerger testSeqMerger(outputFiles); }
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java
License:Open Source License
private void testSeqMerger(Path[] inputFiles) throws Exception {
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.outputMerged.getName(), "-r", "1" };

    // Set up the WARCIndexerRunner
    MDXSeqMerger msm = new MDXSeqMerger();

    // run job
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    msm.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputMergedSeq);
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(outputMerged, new OutputLogFilter()));
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();
}