com.github.sadikovi.riff.Riff.java Source code

Introduction

Here is the source code for com.github.sadikovi.riff.Riff.java

Source

/*
 * Copyright (c) 2017 sadikovi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.github.sadikovi.riff;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.sadikovi.riff.io.CompressionCodec;
import com.github.sadikovi.riff.io.CompressionCodecFactory;

/**
 * [[Riff]] class is the main entry point for working with the Riff file format.
 * It exposes methods to create writers and readers (including metadata readers
 * and writers) with configuration options.
 *
 * Example of writing/reading simple file:
 * {{{
 * // writing a ".gz" file
 * org.apache.hadoop.conf.Configuration conf = ...
 * org.apache.hadoop.fs.Path path = new org.apache.hadoop.fs.Path("file.gz");
 * TypeDescription td = new TypeDescription(structType, indexFields);
 * FileWriter writer = Riff.writer(conf, path, td);
 *
 * // set custom file property
 * writer.setFileProperty("key", "value");
 *
 * writer.prepareWrite();
 * while (rows.hasNext()) {
 *   writer.write(rows.next());
 * }
 * writer.finishWrite();
 *
 * // reading file
 * TreeNode filter = eqt("fieldName", "value");
 * FileReader reader = Riff.reader(conf, new org.apache.hadoop.fs.Path("file.gz"));
 *
 * RowBuffer rowbuf = reader.prepareRead(filter);
 *
 * // get custom file property, must be called after prepareRead() or readFileInfo()
 * String value = reader.getFileProperty("key");
 *
 * while (rowbuf.hasNext()) {
 *   process(rowbuf.next()); // user-specific processing of an InternalRow
 * }
 * rowbuf.close();
 * }}}
 *
 * See the additional methods for setting write/read options, such as forcing a
 * compression codec, specifying the file system, HDFS buffer size, in/out stream
 * buffer size, type description, etc.
 */
public class Riff {
    private static final Logger LOG = LoggerFactory.getLogger(Riff.class);
    // magic number for a Riff file: the bytes "RIFF" in UTF-8 (0x52494646)
    public static final int MAGIC = 1380533830;

    /**
     * Internal riff options that can be set in hadoop configuration.
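     *
     * Example (an illustrative sketch; the "gzip" short name is an assumption about
     * what [[CompressionCodecFactory]] registers):
     * {{{
     * Configuration conf = new Configuration();
     * conf.set(Riff.Options.COMPRESSION_CODEC, "gzip"); // short codec name (assumed)
     * conf.setInt(Riff.Options.STRIPE_ROWS, 20000);
     * conf.setInt(Riff.Options.BUFFER_SIZE, 128 * 1024);
     * conf.setBoolean(Riff.Options.COLUMN_FILTER_ENABLED, false);
     * }}}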
     */
    public static class Options {
        // short name for compression codec
        public static final String COMPRESSION_CODEC = "riff.compression.codec";

        // Number of rows in a single stripe; used for writing only
        public static final String STRIPE_ROWS = "riff.stripe.rows";
        public static final int STRIPE_ROWS_DEFAULT = 10000;

        // buffer size in bytes
        public static final String BUFFER_SIZE = "riff.buffer.size";
        public static final int BUFFER_SIZE_DEFAULT = 256 * 1024;
        public static final int BUFFER_SIZE_MIN = 4 * 1024;
        public static final int BUFFER_SIZE_MAX = 512 * 1024;

        // buffer size for Hadoop output/input stream
        public static final String HDFS_BUFFER_SIZE = "io.file.buffer.size";
        // default buffer size for HDFS, should be a multiple of 4096 bytes (core-default.xml)
        public static final int HDFS_BUFFER_SIZE_DEFAULT = 256 * 1024;

        // whether or not column filters are enabled and should be written into the header
        public static final String COLUMN_FILTER_ENABLED = "riff.column.filter.enabled";
        // column filters are enabled by default
        public static final boolean COLUMN_FILTER_ENABLED_DEFAULT = true;

        /**
         * Get compression codec from configuration.
         * If the option is not set, null is returned.
         * @param conf configuration
         * @return compression codec short name
         */
        static String compressionCodecName(Configuration conf) {
            return conf.get(COMPRESSION_CODEC);
        }

        /**
         * Select the buffer size as a power of 2, clamped to the min/max bounds.
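         *
         * Example (illustrative values):
         * {{{
         * conf.setInt(Riff.Options.BUFFER_SIZE, 100000);
         * Options.power2BufferSize(conf); // 131072, the next power of 2
         * conf.setInt(Riff.Options.BUFFER_SIZE, 1024);
         * Options.power2BufferSize(conf); // 4096, clamped to BUFFER_SIZE_MIN
         * }}}
         *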
         * @param conf configuration
         * @return validated bytes value
         */
        static int power2BufferSize(Configuration conf) {
            int bytes = conf.getInt(BUFFER_SIZE, BUFFER_SIZE_DEFAULT);
            if (bytes > BUFFER_SIZE_MAX)
                return BUFFER_SIZE_MAX;
            if (bytes < BUFFER_SIZE_MIN)
                return BUFFER_SIZE_MIN;
            // bytes is already a power of 2
            if ((bytes & (bytes - 1)) == 0)
                return bytes;
            bytes = Integer.highestOneBit(bytes) << 1;
            return (bytes < BUFFER_SIZE_MAX) ? bytes : BUFFER_SIZE_MAX;
        }

        /**
         * Select HDFS buffer size.
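         *
         * Example (illustrative values):
         * {{{
         * conf.setInt(Riff.Options.HDFS_BUFFER_SIZE, 512 * 1024);
         * Options.hdfsBufferSize(conf); // 524288: above the default and a multiple of 4096
         * conf.setInt(Riff.Options.HDFS_BUFFER_SIZE, 8192);
         * Options.hdfsBufferSize(conf); // 262144: falls back to the default
         * }}}
         *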
         * @param conf configuration
         * @return HDFS buffer size to use when opening or creating a file
         */
        static int hdfsBufferSize(Configuration conf) {
            // bytes should be a multiple of the 4096-byte hardware page size
            int pageSize = 4096;
            int bytes = conf.getInt(HDFS_BUFFER_SIZE, HDFS_BUFFER_SIZE_DEFAULT);
            if (bytes > HDFS_BUFFER_SIZE_DEFAULT && bytes % pageSize == 0)
                return bytes;
            // otherwise return default size
            return HDFS_BUFFER_SIZE_DEFAULT;
        }

        /**
         * Select positive number of rows in stripe.
         * @param conf configuration
         * @return number of rows in a stripe; throws IllegalArgumentException if the value is not positive
         */
        static int numRowsInStripe(Configuration conf) {
            int rows = conf.getInt(STRIPE_ROWS, STRIPE_ROWS_DEFAULT);
            // there should be a positive number of rows in a stripe
            if (rows < 1) {
                throw new IllegalArgumentException(
                        "Expected positive number of rows in stripe, found " + rows + " <= 0");
            }
            return rows;
        }

        /**
         * Check whether column filters are enabled.
         * @param conf configuration
         * @return true if column filters are enabled
         */
        static boolean columnFilterEnabled(Configuration conf) {
            return conf.getBoolean(COLUMN_FILTER_ENABLED, COLUMN_FILTER_ENABLED_DEFAULT);
        }
    }

    /**
     * Encode compression codec into byte flag.
     * @param codec compression codec, can be null
     * @return byte encoded flag
     */
    protected static byte encodeCompressionCodec(CompressionCodec codec) {
        return CompressionCodecFactory.encode(codec);
    }

    /**
     * Decode byte encoded flag into compression codec.
     * Compression codec can be null if flag is set to 0.
     * @param flag byte flag
     * @return compression codec or null for uncompressed stream
     */
    protected static CompressionCodec decodeCompressionCodec(byte flag) {
        return CompressionCodecFactory.decode(flag);
    }

    /**
     * Infer compression codec from file name.
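     *
     * Example (illustrative; the ".gz" case follows the class-level example above):
     * {{{
     * inferCompressionCodec(new Path("data/file.gz")); // codec registered for ".gz"
     * inferCompressionCodec(new Path("data/file"));    // no extension, treated as uncompressed
     * inferCompressionCodec(new Path("data/.hidden")); // leading dot only, treated as uncompressed
     * }}}
     *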
     * @param path path to the file
     * @return compression codec or null for uncompressed
     */
    protected static CompressionCodec inferCompressionCodec(Path path) {
        String name = path.getName();
        int start = name.lastIndexOf('.');
        String ext = (start <= 0) ? "" : name.substring(start);
        return CompressionCodecFactory.forFileExt(ext);
    }

    private Riff() {
        /* no-op */
    }

    //////////////////////////////////////////////////////////////
    // Public API for file writer
    //////////////////////////////////////////////////////////////
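
    /*
     * Example (illustrative): the three overloads below differ only in how the
     * file system and configuration are supplied.
     * {{{
     * FileWriter w1 = Riff.writer(path.getFileSystem(conf), conf, path, td);
     * FileWriter w2 = Riff.writer(conf, path, td); // file system derived from path
     * FileWriter w3 = Riff.writer(path, td);       // fresh default Configuration
     * }}}
     */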

    /**
     * Get new writer.
     * Compression codec, if not set, is inferred from the file path.
     * @param fs file system to use
     * @param conf configuration with Riff options
     * @param path path to write
     * @param td type description (schema)
     * @return file writer
     */
    public static FileWriter writer(FileSystem fs, Configuration conf, Path path, TypeDescription td) {
        // check if compression codec is set in configuration, otherwise fall back to
        // inferring the codec from the file extension
        CompressionCodec codec;
        if (Options.compressionCodecName(conf) != null) {
            codec = CompressionCodecFactory.forShortName(Options.compressionCodecName(conf));
        } else {
            codec = inferCompressionCodec(path);
        }
        try {
            return new FileWriter(fs, conf, path, td, codec);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get new writer.
     * Compression codec, if not set, is inferred from the file path.
     * @param conf configuration with Riff options
     * @param path path to write
     * @param td type description
     * @return file writer
     */
    public static FileWriter writer(Configuration conf, Path path, TypeDescription td) {
        try {
            return writer(path.getFileSystem(conf), conf, path, td);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get new writer.
     * Compression codec, if not set, is inferred from the file path.
     * @param path path to write
     * @param td type description
     * @return file writer
     */
    public static FileWriter writer(Path path, TypeDescription td) {
        return writer(new Configuration(), path, td);
    }

    //////////////////////////////////////////////////////////////
    // Public API for file reader
    //////////////////////////////////////////////////////////////
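
    /*
     * Example (illustrative): mirrors the writer overloads.
     * {{{
     * FileReader r1 = Riff.reader(fs, conf, path); // explicit file system
     * FileReader r2 = Riff.reader(conf, path);     // file system from path
     * FileReader r3 = Riff.reader(path);           // default Configuration
     * }}}
     */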

    /**
     * Get new reader.
     * @param fs file system to use
     * @param conf configuration with Riff options
     * @param path file path to read
     * @return file reader
     */
    public static FileReader reader(FileSystem fs, Configuration conf, Path path) {
        try {
            return new FileReader(fs, conf, path);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get new reader.
     * @param conf configuration with Riff options
     * @param path file path to read
     * @return file reader
     */
    public static FileReader reader(Configuration conf, Path path) {
        try {
            return reader(path.getFileSystem(conf), conf, path);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get new reader.
     * @param path file path to read
     * @return file reader
     */
    public static FileReader reader(Path path) {
        return reader(new Configuration(), path);
    }

    //////////////////////////////////////////////////////////////
    // Public API for metadata write/read
    //////////////////////////////////////////////////////////////
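
    /*
     * Example (an illustrative sketch; see [[Metadata]] for the actual read and
     * write calls, only the factory methods from this class are shown):
     * {{{
     * Metadata.MetadataWriter mwriter = Riff.metadataWriter(new Path("file.gz"));
     * Metadata.MetadataReader mreader = Riff.metadataReader(new Path("/dir/with/metadata"));
     * }}}
     */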

    /**
     * Get metadata reader.
     * @param fs file system to use
     * @param conf configuration with Riff options
     * @param metadataPath path to the metadata file or directory where metadata is stored
     * @return metadata reader
     */
    public static Metadata.MetadataReader metadataReader(FileSystem fs, Configuration conf, Path metadataPath) {
        return new Metadata.MetadataReader(fs, conf, metadataPath);
    }

    /**
     * Get metadata reader.
     * @param conf configuration with Riff options
     * @param metadataPath path to the metadata file or directory where metadata is stored
     * @return metadata reader
     */
    public static Metadata.MetadataReader metadataReader(Configuration conf, Path metadataPath) {
        try {
            return metadataReader(metadataPath.getFileSystem(conf), conf, metadataPath);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get metadata reader.
     * @param metadataPath path to the metadata file or directory where metadata is stored
     * @return metadata reader
     */
    public static Metadata.MetadataReader metadataReader(Path metadataPath) {
        return metadataReader(new Configuration(), metadataPath);
    }

    /**
     * Get metadata writer.
     * @param fs file system to use
     * @param conf hadoop configuration with Riff options
     * @param filepath filepath to a valid Riff file
     * @return metadata writer
     */
    public static Metadata.MetadataWriter metadataWriter(FileSystem fs, Configuration conf, Path filepath) {
        try {
            return new Metadata.MetadataWriter(fs, conf, filepath);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get metadata writer.
     * @param conf hadoop configuration with Riff options
     * @param filepath filepath to a valid Riff file
     * @return metadata writer
     */
    public static Metadata.MetadataWriter metadataWriter(Configuration conf, Path filepath) {
        try {
            return metadataWriter(filepath.getFileSystem(conf), conf, filepath);
        } catch (IOException err) {
            throw new RuntimeException("Error occured: " + err.getMessage(), err);
        }
    }

    /**
     * Get metadata writer.
     * @param filepath filepath to a valid Riff file
     * @return metadata writer
     */
    public static Metadata.MetadataWriter metadataWriter(Path filepath) {
        return metadataWriter(new Configuration(), filepath);
    }
}
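
Example usage

A minimal end-to-end sketch assembled from the class-level javadoc above. The schema (structType), index fields, row iterator (rows), the eqt(...) filter helper, and process(...) are placeholders carried over from that javadoc; their defining packages live elsewhere in the riff sources:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

Configuration conf = new Configuration();
Path path = new Path("file.gz"); // ".gz" extension lets Riff infer the codec

// write
TypeDescription td = new TypeDescription(structType, indexFields);
FileWriter writer = Riff.writer(conf, path, td);
writer.setFileProperty("created-by", "example"); // optional custom property
writer.prepareWrite();
while (rows.hasNext()) {
  writer.write(rows.next());
}
writer.finishWrite();

// read, filtering on an indexed field
FileReader reader = Riff.reader(conf, path);
RowBuffer rowbuf = reader.prepareRead(eqt("fieldName", "value"));
String createdBy = reader.getFileProperty("created-by"); // valid after prepareRead()
while (rowbuf.hasNext()) {
  process(rowbuf.next()); // user-specific handling of each InternalRow
}
rowbuf.close();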