name.abhijitsarkar.hadoop.io.IOUtils.java Source code

Introduction

Here is the source code for name.abhijitsarkar.hadoop.io.IOUtils.java, a small utility class for creating and reading Hadoop MapFiles and for compressing and uncompressing files with Hadoop's codec framework.

Source

/*******************************************************************************
 * Copyright (c) 2014, the original author or authors.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * A copy of the GNU General Public License accompanies this software, 
 * and is also available at http://www.gnu.org/licenses.
 *******************************************************************************/
package name.abhijitsarkar.hadoop.io;

import java.io.BufferedReader;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URI;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.hadoop.io.compress.CompressionOutputStream;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.util.ReflectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A utility class for I/O work: creating and reading Hadoop MapFiles, and compressing and uncompressing files.
 * 
 * @author Abhijit Sarkar
 */
public class IOUtils {
    public static final Logger LOGGER = LoggerFactory.getLogger(IOUtils.class);

    /**
     * 
     * @param key
     *            The key to be looked up
     * @param dirURI
     *            The MapFile directory
     * @param conf
     *            Job configuration
     * @return The value corresponding to the key, if found; null otherwise
     */
    /*
     * For this method to be truly generic, caller should pass in the key and value classes
     */
    public static Writable findInMapFile(final Text key, URI dirURI, final Configuration conf) {
        MapFile.Reader reader = null;
        Writable value = null;
        LOGGER.debug("MapFile URI: {}.", dirURI);

        try {
            final Path inputPath = new Path(dirURI);
            final FileSystem fs = inputPath.getFileSystem(conf);
            reader = new MapFile.Reader(fs, dirURI.toString(), conf);

            value = reader.get(key, new Text());
        } catch (IOException ioe) {
            LOGGER.error("Failed to look up key: {}.", key, ioe);
        } finally {
            closeStreams(reader);
        }

        LOGGER.debug("Returning value: {}, for key: {}.", value, key);

        return value;

    }
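    /*
     * A minimal usage sketch (the MapFile directory and the key below are hypothetical, for illustration only):
     *
     * Configuration conf = new Configuration();
     * Writable value = IOUtils.findInMapFile(new Text("someKey"), URI.create("file:///tmp/map_file"), conf);
     */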

    /**
     * 
     * @param inputURI
     *            The file from which the MapFile is created; its lines must be sorted by key, since MapFile
     *            requires keys in sorted order
     * @param conf
     *            Job configuration
     * @return MapFile URI
     * @throws Exception
     *             If it fails to create the MapFile
     */
    @SuppressWarnings("resource")
    public static URI createMapFile(URI inputURI, final Configuration conf) throws Exception {

        LOGGER.debug("Attempting to create MapFile from input URI: {}.", inputURI);

        inputURI = uncompressFile(inputURI, conf);

        final Path inputPath = new Path(inputURI);
        final FileSystem fs = inputPath.getFileSystem(conf);

        final InputStream data = new FileInputStream(inputURI.getPath());
        final BufferedReader reader = new BufferedReader(new InputStreamReader(data, StandardCharsets.UTF_8));

        String line = null;
        final URI mapFileURI = new Path(inputPath.getParent(), "map_file").toUri();

        LOGGER.debug("MapFile URI: {}.", mapFileURI);

        /*
         * For this method to be truly generic, caller should pass in the key and value classes
         */
        final MapFile.Writer writer = new MapFile.Writer(conf, fs, mapFileURI.toString(), Text.class, Text.class);

        String[] tokens = null;

        try {
            while ((line = reader.readLine()) != null) {
                tokens = line.split("\\s", 2);

                /*
                 * For this method to be truly generic, caller should pass in a token parser
                 */
                if (tokens.length < 2) {
                    LOGGER.error("Don't know how to parse line: {}.", line);
                    throw new RuntimeException("Don't know how to parse line: " + line);
                }

                /* The first token is the key; the rest of the line is the value. */
                writer.append(new Text(tokens[0]), new Text(tokens[1]));
            }
        } finally {
            /* Close the streams even if parsing fails, so that nothing leaks. */
            closeStreams(writer, reader, data);
        }

        return mapFileURI;
    }
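    /*
     * A minimal usage sketch, assuming a local, whitespace-delimited key/value file sorted by key (the path is
     * hypothetical):
     *
     * URI mapFileURI = IOUtils.createMapFile(URI.create("file:///tmp/data.txt"), new Configuration());
     */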

    /**
     * 
     * @param uncompressedURI
     *            The file to be compressed
     * @param codecName
     *            The codec to be used for compression; currently ignored, gzip is always used (see the comment in
     *            the method body)
     * @param conf
     *            Job configuration
     * @return The archive URI
     * @throws Exception
     *             If it fails to create the archive
     */
    public static URI compressFile(final URI uncompressedURI, final String codecName, final Configuration conf)
            throws Exception {
        /*
         * Hadoop 1.1.2 does not have a CompressionCodecFactory#getCodecByName method, and instantiating GzipCodec
         * with the new operator throws an NPE (probably a Hadoop bug), so the codec is looked up by class name
         * instead. As a result, the codecName parameter is currently unused.
         */
        final CompressionCodec codec = getCodecByClassName(GzipCodec.class.getName(), conf);
        final Path uncompressedPath = new Path(uncompressedURI);

        String archiveName = addExtension(uncompressedPath.getName(), codec.getDefaultExtension(), true);

        final Path archivePath = new Path(uncompressedPath.getParent(), archiveName);

        LOGGER.debug("uncompressedURI: {}.", uncompressedURI);
        LOGGER.debug("archiveURI: {}", archivePath.toString());

        OutputStream outputStream = null;
        InputStream inputStream = null;
        CompressionOutputStream out = null;

        try {
            outputStream = new FileOutputStream(archivePath.toUri().getPath());
            inputStream = new FileInputStream(uncompressedURI.getPath());

            out = codec.createOutputStream(outputStream);
            org.apache.hadoop.io.IOUtils.copyBytes(inputStream, out, conf, false);
            out.finish();
        } finally {
            /* Close the compression stream before its underlying output stream. */
            closeStreams(inputStream, out, outputStream);
        }

        return archivePath.toUri();
    }
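    /*
     * A minimal usage sketch (the input path is hypothetical; note that the codec name is currently ignored and
     * gzip is always used):
     *
     * URI archiveURI = IOUtils.compressFile(URI.create("file:///tmp/data.txt"), "gzip", new Configuration());
     */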

    /* Finds a codec by its class name among the codecs registered with the CompressionCodecFactory */
    private static CompressionCodec getCodecByClassName(final String className, final Configuration conf)
            throws InstantiationException, IllegalAccessException {
        CompressionCodec codec = null;
        List<Class<? extends CompressionCodec>> allCodecClasses = CompressionCodecFactory.getCodecClasses(conf);
        for (Class<? extends CompressionCodec> clazz : allCodecClasses) {
            if (clazz.getName().equals(className)) {
                codec = ReflectionUtils.newInstance(clazz, conf);
                break;
            }
        }
        return codec;
    }

    /**
     * 
     * @param filename
     *            The filename whose extension is to be changed
     * @param newExtension
     *            The new extension
     * @param removeExistingExtension
     *            Whether to remove the existing extension before appending the new one
     * @return The filename with the new extension appended
     */
    public static String addExtension(String filename, final String newExtension,
            final boolean removeExistingExtension) {
        if (filename == null || filename.length() <= 1) {
            return filename;
        }

        LOGGER.debug("Received: {}.", filename);

        if (removeExistingExtension) {
            filename = removeExtension(filename);
        }

        LOGGER.debug("Returning: {}.", filename + newExtension);
        return filename + newExtension;
    }

    public static String removeExtension(String filename) {
        if (filename == null || filename.length() <= 1) {
            return filename;
        }

        LOGGER.debug("Received: {}.", filename);

        final int idx = filename.lastIndexOf('.');

        if (idx >= 0) {
            filename = filename.substring(0, idx);
        }

        LOGGER.debug("Returning: {}.", filename);
        return filename;
    }
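    /*
     * Examples of the extension helpers:
     *
     * addExtension("data.txt", ".gz", true)  returns "data.gz"
     * addExtension("data.txt", ".gz", false) returns "data.txt.gz"
     * removeExtension("data.txt.gz")         returns "data.txt"
     */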

    /**
     * 
     * @param archiveURI
     *            The archive to be extracted
     * @param conf
     *            Job configuration
     * @return Extracted file URI
     * @throws IOException
     *             If it fails to extract the archive
     */
    public static URI uncompressFile(final URI archiveURI, final Configuration conf) throws IOException {
        Path archivePath = new Path(archiveURI);
        OutputStream outputStream = null;
        InputStream inputStream = null;
        CompressionInputStream in = null;
        Path uncompressionPath = null;

        try {
            final FileSystem fs = FileSystem.getLocal(conf);

            FileStatus[] statuses = new FileStatus[] { fs.getFileStatus(archivePath) };
            if (statuses[0].isDir()) {
                statuses = fs.listStatus(archivePath);

                LOGGER.debug("Archive is a directory and contains {} elements.", statuses.length);

                /* Only the first entry of the directory is considered. */
                archivePath = statuses[0].getPath();
            }

            LOGGER.debug("archiveURI: {}.", archivePath.toUri());

            final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(archivePath);
            if (codec == null) {
                LOGGER.debug("Not an archive: {}.", archivePath.toUri());
                return archivePath.toUri();
            }

            LOGGER.debug("Using codec: {}.", codec.getClass().getName());
            uncompressionPath = new Path(addExtension(archivePath.toUri().getPath(), ".new", true));

            LOGGER.debug("uncompressedURI: {}.", uncompressionPath.toUri());

            outputStream = new FileOutputStream(uncompressionPath.toUri().getPath());

            inputStream = new FileInputStream(archivePath.toUri().getPath());
            in = codec.createInputStream(inputStream);
            org.apache.hadoop.io.IOUtils.copyBytes(in, outputStream, conf, false);
        } finally {
            /* Close the decompression stream before its underlying input stream. */
            closeStreams(in, inputStream, outputStream);
        }

        return uncompressionPath.toUri();
    }
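    /*
     * A minimal usage sketch (the archive path is hypothetical); files the codec factory does not recognize are
     * returned unchanged:
     *
     * URI fileURI = IOUtils.uncompressFile(URI.create("file:///tmp/data.txt.gz"), new Configuration());
     */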

    /* Closes multiple streams at once */
    private static void closeStreams(final Closeable... streams) {
        for (Closeable stream : streams) {
            org.apache.hadoop.io.IOUtils.closeStream(stream);
        }
    }
}
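
Example

The following is a minimal, hypothetical driver that exercises the class end to end: compress a local file, uncompress it again, build a MapFile from the result, and look up a key. The input path and the key are placeholders, not part of the original class; adjust them to your environment.

package name.abhijitsarkar.hadoop.io;

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class IOUtilsExample {
    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();

        /* Hypothetical local input file with whitespace-delimited key/value lines, sorted by key (MapFile requires it). */
        final URI inputURI = URI.create("file:///tmp/data.txt");

        /* Compress, then uncompress; the codec name argument is currently ignored (gzip is always used). */
        final URI archiveURI = IOUtils.compressFile(inputURI, "gzip", conf);
        final URI uncompressedURI = IOUtils.uncompressFile(archiveURI, conf);

        /* Build a MapFile next to the input and look up a hypothetical key. */
        final URI mapFileURI = IOUtils.createMapFile(uncompressedURI, conf);
        final Writable value = IOUtils.findInMapFile(new Text("someKey"), mapFileURI, conf);

        System.out.println("Value: " + value);
    }
}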