com.alexholmes.hadooputils.test.TextIOJobBuilder.java Source code

Introduction

Here is the source code for com.alexholmes.hadooputils.test.TextIOJobBuilder.java
Source

/*
 * Copyright 2012 Alex Holmes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alexholmes.hadooputils.test;

import com.alexholmes.hadooputils.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

import java.io.DataOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertEquals;

/**
 * A class that helps with testing MapReduce jobs with the
 * {@link org.apache.hadoop.mapred.LocalJobRunner}, which is an in-memory MapReduce
 * implementation.
 * <p>
 * Typical usage is to buffer input lines with {@link #addInput(String)}, buffer the expected
 * output lines with {@link #addExpectedOutput(String)}, call {@link #writeInputs()} to
 * materialize the input file, run the job against {@link #getInputPath()} and
 * {@link #getOutputPath()}, and finally call {@link #verifyResults()}.
 * <p>
 * Instances are mutable builders; nothing in this class synchronizes access to its state.
 */
public class TextIOJobBuilder {

    /**
     * Input directory used when the caller does not supply one.
     */
    private static final String DEFAULT_INPUT_DIR = "/input";

    /**
     * Output directory used when the caller does not supply one.
     */
    private static final String DEFAULT_OUTPUT_DIR = "/output";

    /**
     * Name of the single input file created inside the input directory.
     */
    private static final String INPUT_FILE_NAME = "part-0";

    /**
     * Only files in the output directory whose name starts with this prefix are verified.
     */
    private static final String OUTPUT_FILE_PREFIX = "part";

    /**
     * The key/value separator for input files (tab unless changed by the caller).
     */
    private String inputSeparator = "\t";

    /**
     * The key/value separator for output files (tab unless changed by the caller).
     */
    private String outputSeparator = "\t";

    /**
     * The input directory for the MapReduce job.
     */
    private final Path inputPath;

    /**
     * The output directory for the MapReduce job.
     */
    private final Path outputPath;

    /**
     * The file system in which the input is created and the output is read.
     */
    private final FileSystem fs;

    /**
     * The input strings which are written to the directory specified in {@link #inputPath}.
     */
    private final List<String> inputs = new ArrayList<String>();

    /**
     * The expected output strings which are compared against the lines read from the
     * directory specified in {@link #outputPath}.
     */
    private final List<String> expectedOutputs = new ArrayList<String>();

    /**
     * Constructor which instantiates input/output paths.
     *
     * @param fs         the file system within which input directory and
     *                   files are created
     * @param inputPath  the input directory where input files will be created, or
     *                   {@code null} to use {@code /input}
     * @param outputPath the output directory that the MapReduce job will write to, or
     *                   {@code null} to use {@code /output}
     * @throws IOException declared for compatibility with existing callers
     */
    public TextIOJobBuilder(final FileSystem fs, final Path inputPath, final Path outputPath)
            throws IOException {
        this.fs = fs;
        this.inputPath = inputPath != null ? inputPath : new Path(DEFAULT_INPUT_DIR);
        this.outputPath = outputPath != null ? outputPath : new Path(DEFAULT_OUTPUT_DIR);
    }

    /**
     * Constructor which obtains the file system from the configuration and instantiates
     * input/output paths.
     *
     * @param config     the Hadoop configuration used to look up the file system
     * @param inputPath  the input directory where input files will be created, or
     *                   {@code null} to use {@code /input}
     * @param outputPath the output directory that the MapReduce job will write to, or
     *                   {@code null} to use {@code /output}
     * @throws IOException if the file system cannot be obtained
     */
    public TextIOJobBuilder(final Configuration config, final Path inputPath, final Path outputPath)
            throws IOException {
        this(FileSystem.get(config), inputPath, outputPath);
    }

    /**
     * Constructor which obtains the file system from the configuration and uses the default
     * {@code /input} and {@code /output} directories.
     *
     * @param config the Hadoop configuration used to look up the file system
     * @throws IOException if the file system cannot be obtained
     */
    public TextIOJobBuilder(final Configuration config) throws IOException {
        this(FileSystem.get(config), null, null);
    }

    /**
     * Constructor which uses the supplied file system and the default {@code /input} and
     * {@code /output} directories.
     * <p>
     * The {@code config} argument is not used by this constructor; it is retained so that
     * existing callers keep compiling.
     *
     * @param config     the Hadoop configuration (ignored)
     * @param fileSystem the Hadoop file system
     * @throws IOException declared for compatibility with existing callers
     */
    public TextIOJobBuilder(final Configuration config, final FileSystem fileSystem)
            throws IOException {
        this(fileSystem, null, null);
    }

    /**
     * Constructor which uses the supplied file system and the default {@code /input} and
     * {@code /output} directories.
     *
     * @param fileSystem the Hadoop file system
     * @throws IOException declared for compatibility with existing callers
     */
    public TextIOJobBuilder(final FileSystem fileSystem) throws IOException {
        this(fileSystem, null, null);
    }

    /**
     * Set the input file key/value separator.
     * <p>
     * The separator is applied when {@link #addInput(String...)} is called, so lines that
     * were added earlier keep the separator that was in effect at that time.
     *
     * @param separator the separator
     * @return a reference to this object
     */
    public TextIOJobBuilder setInputSeparator(final String separator) {
        this.inputSeparator = separator;
        return this;
    }

    /**
     * Get the input file key/value separator.
     *
     * @return the separator
     */
    public String getInputSeparator() {
        return inputSeparator;
    }

    /**
     * Set the output file key/value separator.
     * <p>
     * The separator is applied when {@link #addExpectedOutput(String...)} is called, so lines
     * that were added earlier keep the separator that was in effect at that time.
     *
     * @param separator the separator
     * @return a reference to this object
     */
    public TextIOJobBuilder setOutputSeparator(final String separator) {
        this.outputSeparator = separator;
        return this;
    }

    /**
     * Get the output file key/value separator.
     *
     * @return the separator
     */
    public String getOutputSeparator() {
        return outputSeparator;
    }

    /**
     * Add a line to the inputs.
     *
     * @param line an input line
     * @return a reference to this object
     */
    public TextIOJobBuilder addInput(final String line) {
        inputs.add(line);
        return this;
    }

    /**
     * Add a single line to the inputs, where each part is separated by
     * {@link #getInputSeparator()}.
     *
     * @param parts vargs/array of tokens
     * @return a reference to this object
     */
    public TextIOJobBuilder addInput(final String... parts) {
        inputs.add(StringUtils.join(parts, inputSeparator));
        return this;
    }

    /**
     * Add a line to the expected outputs.
     *
     * @param line an output line
     * @return a reference to this object
     */
    public TextIOJobBuilder addExpectedOutput(final String line) {
        expectedOutputs.add(line);
        return this;
    }

    /**
     * Add a single line to the expected outputs, where each part is separated by
     * {@link #getOutputSeparator()}.
     *
     * @param parts vargs/array of tokens
     * @return a reference to this object
     */
    public TextIOJobBuilder addExpectedOutput(final String... parts) {
        expectedOutputs.add(StringUtils.join(parts, outputSeparator));
        return this;
    }

    /**
     * Gathers all the inputs buffered by calls to {@link #addInput(String)} or
     * {@link #addInput(String...)} and writes them to the input directory, in
     * preparation for running the MapReduce job.
     * <p>
     * Any existing output and input directories are deleted recursively first, then the
     * input directory is recreated and all buffered lines are written to a single file
     * named {@code part-0} inside it.
     *
     * @return a reference to this object
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder writeInputs() throws IOException {

        // Remove leftovers from a previous run so the job starts from a clean slate.
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        if (fs.exists(inputPath)) {
            fs.delete(inputPath, true);
        }
        fs.mkdirs(inputPath);

        DataOutputStream stream = fs.create(new Path(inputPath, INPUT_FILE_NAME));
        try {
            // Lines are terminated with the platform line separator.
            IOUtils.writeLines(inputs, String.format("%n"), stream);
        } finally {
            // Always release the stream, even when writing fails part-way through.
            stream.close();
        }

        return this;
    }

    /**
     * Called after the MapReduce job has completed, to verify that the outputs
     * generated by the MapReduce job align with the expected outputs that were
     * set with calls to {@link #addExpectedOutput(String)} and
     * {@link #addExpectedOutput(String...)}.
     * <p>
     * The lines of every file in the output directory whose name starts with {@code part}
     * are concatenated, visiting the files in sorted path order, and compared one by one
     * with the expected lines. A differing line or a differing line count fails the
     * verification with an {@link AssertionError}.
     *
     * @return a reference to this object
     * @throws IOException if something goes wrong
     */
    public TextIOJobBuilder verifyResults() throws IOException {

        FileStatus[] outputFiles = fs.listStatus(outputPath, new PathFilter() {
            @Override
            public boolean accept(final Path path) {
                return path.getName().startsWith(OUTPUT_FILE_PREFIX);
            }
        });

        // The listing order is not guaranteed, but the comparison below is positional,
        // so visit the part files in a deterministic (path) order.
        Arrays.sort(outputFiles);

        System.out.println("Output files: " + StringUtils.join(outputFiles));

        List<String> actualOutputs = new ArrayList<String>();
        for (FileStatus file : outputFiles) {
            actualOutputs.addAll(FileUtils.readLines(fs, file.getPath()));
        }

        // Compare the common prefix first so the earliest differing line is reported, and
        // never index past the end of the expected list when the job emitted extra lines.
        int comparableLines = Math.min(expectedOutputs.size(), actualOutputs.size());
        for (int i = 0; i < comparableLines; i++) {
            assertEquals("Unexpected content in output line " + (i + 1),
                    expectedOutputs.get(i), actualOutputs.get(i));
        }

        assertEquals("Unexpected number of output lines",
                expectedOutputs.size(), actualOutputs.size());

        return this;
    }

    /**
     * Gets the input path.
     *
     * @return the input path
     */
    public Path getInputPath() {
        return inputPath;
    }

    /**
     * Gets the output path.
     *
     * @return the output path
     */
    public Path getOutputPath() {
        return outputPath;
    }

    /**
     * Get the file system.
     *
     * @return the file system
     */
    public FileSystem getFs() {
        return fs;
    }
}