com.github.sadikovi.riff.FileWriter.java Source code

Java tutorial

Introduction

Here is the source code for com.github.sadikovi.riff.FileWriter.java

Source

/*
 * Copyright (c) 2017 sadikovi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

package com.github.sadikovi.riff;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;

import org.apache.spark.sql.catalyst.InternalRow;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.github.sadikovi.riff.column.ColumnFilter;
import com.github.sadikovi.riff.io.CompressionCodec;
import com.github.sadikovi.riff.io.OutputBuffer;
import com.github.sadikovi.riff.io.OutStream;
import com.github.sadikovi.riff.io.StripeOutputBuffer;
import com.github.sadikovi.riff.stats.Statistics;

/**
 * File writer provides methods to prepare file and write rows. File consists of three regions:
 * static metadata (FileHeader), main data in stripes, and dynamic metadata (FileFooter). Static
 * metadata includes type description and state, dynamic metadata includes statistics and stripe
 * information.
 * This class should be used per thread, is not thread-safe.
 *
 * Usage:
 * {{{
 * Iterator<InternalRow> rows = ...;
 * writer.prepareWrite();
 * while (rows.hasNext()) {
 *   writer.write(rows.next());
 * }
 * writer.finishWrite();
 * }}}
 *
 * Writer should be used only to create file once, reuses are not allowed - create a new
 * instance instead. Multiple calls of `prepareWrite` are allowed and result in no-op, the same
 * goes for `finishWrite` method. When `finishWrite` is called, writer flushes the last stripe and
 * finalizes riff file.
 */
public class FileWriter {
    private static final Logger LOG = LoggerFactory.getLogger(FileWriter.class);

    // file system to use for writing a file
    private final FileSystem fs;
    // resolved path for final Riff file
    private final Path filePath;
    // type description for schema to write
    private final TypeDescription td;
    // number of rows in stripe
    private final int numRowsInStripe;
    // buffer size for outstream
    private final int bufferSize;
    // HDFS buffer size for creating stream
    private final int hdfsBufferSize;
    // whether or not column filters are enabled
    private final boolean columnFilterEnabled;
    // compression codec, can be null
    private final CompressionCodec codec;

    // write has been prepared
    private boolean writePrepared;
    // write has been finished
    private boolean writeFinished;
    // file output stream
    private FSDataOutputStream outputStream;
    // custom file properties, written as part of header
    private HashMap<String, String> fileProperties;
    // stripe id (incremented for each stripe)
    private short stripeId;
    // current position in the stream
    private long currentOffset;
    // all stripes in a data file
    private ArrayList<StripeInformation> stripes;
    // record writer
    private IndexedRowWriter recordWriter;
    // current stripe output buffer
    private StripeOutputBuffer stripe;
    // current stripe out stream
    private OutStream stripeStream;
    // number of records in current stripe
    private int stripeCurrentRecords;
    // total number of records in file
    private int totalRecords;
    // statistics per stripe
    private Statistics[] stripeStats;
    // column filters per stripe
    private ColumnFilter[] stripeFilters;

    /**
     * Create file writer for path.
     * Configuration is passed separately and not reused from `fs.getConf`. This is to be explicit
     * about separate configuration from most of the hadoop settings. Actual user-facing API will
     * allow providing configuration for both file system and internal options.
     * @param fs file system to use
     * @param conf configuration
     * @param path path to the header file, also used to create data path
     * @param td type description for rows
     * @param codec compression codec, null means uncompressed
     * @throws IOException if an I/O error occurs while checking the path
     * @throws FileAlreadyExistsException if the target file already exists
     */
    FileWriter(FileSystem fs, Configuration conf, Path path, TypeDescription td, CompressionCodec codec)
            throws IOException {
        this.fs = fs;
        this.filePath = fs.makeQualified(path);
        this.writePrepared = false;
        this.writeFinished = false;
        if (this.fs.exists(filePath)) {
            throw new FileAlreadyExistsException("Already exists: " + filePath);
        }
        // this assumes that subsequent rows are provided for this schema
        this.td = td;
        this.numRowsInStripe = Riff.Options.numRowsInStripe(conf);
        this.bufferSize = Riff.Options.power2BufferSize(conf);
        this.hdfsBufferSize = Riff.Options.hdfsBufferSize(conf);
        this.columnFilterEnabled = Riff.Options.columnFilterEnabled(conf);
        this.codec = codec;
        // current stripe stats and filters
        this.stripeStats = null;
        this.stripeFilters = null;
        // file properties, by default not initialized
        this.fileProperties = null;
    }

    /**
     * Return file path for writer.
     * @return qualified file path
     */
    public Path filePath() {
        return filePath;
    }

    /**
     * Number of rows in stripe for this writer.
     * @return positive number of rows
     */
    public int numRowsInStripe() {
        return numRowsInStripe;
    }

    /**
     * Return selected buffer size this is used for outstream instances.
     * @return buffer size as power of 2
     */
    public int bufferSize() {
        return bufferSize;
    }

    /**
     * Get current compression codec.
     * @return compression codec or null for uncompressed
     */
    public CompressionCodec codec() {
        return codec;
    }

    /**
     * Set custom file property to be written as part of file header.
     * Must be called before 'prepareWrite()' method.
     * @param key non-null key
     * @param value non-null value for the key
     */
    public void setFileProperty(String key, String value) {
        if (writePrepared || writeFinished) {
            throw new IllegalStateException("Cannot set property on already prepared/finished file. "
                    + "Method 'setFileProperty()' should be called before 'prepareWrite()'");
        }
        if (key == null || value == null) {
            throw new IllegalArgumentException("Null key/value: key=" + key + ", value=" + value);
        }
        if (fileProperties == null) {
            // lazily initialized so files without custom properties carry no map
            fileProperties = new HashMap<String, String>();
        }
        fileProperties.put(key, value);
    }

    /**
     * Prepare writer. This method initializes stripes, statistics and counters, creates the
     * output stream and writes the file header. Subsequent calls are no-ops; calling after
     * `finishWrite()` fails, because writer reuse is not supported.
     * @throws IOException if writer is reused or stream/header creation fails
     */
    public void prepareWrite() throws IOException {
        if (writeFinished)
            throw new IOException("Writer reuse");
        if (writePrepared)
            return;
        LOG.debug("Prepare file writer {}", this);
        stripeId = 0;
        currentOffset = 0L;
        totalRecords = 0;
        stripes = new ArrayList<StripeInformation>();
        recordWriter = new IndexedRowWriter(td);
        LOG.debug("Initialized record writer {}", recordWriter);
        // initialize stripe related parameters
        stripe = new StripeOutputBuffer(stripeId++);
        stripeStream = new OutStream(bufferSize, codec, stripe);
        stripeStats = createStatistics(td);
        stripeFilters = createColumnFilters(td, columnFilterEnabled, numRowsInStripe);
        stripeCurrentRecords = numRowsInStripe;
        LOG.debug("Initialize stripe outstream {}", stripeStream);
        // create stream for riff file and write header information
        FileHeader fileHeader = new FileHeader(td, fileProperties);
        fileHeader.setState(0, Riff.encodeCompressionCodec(codec));
        try {
            outputStream = fs.create(filePath, false, hdfsBufferSize);
            LOG.debug("Write file header");
            fileHeader.writeTo(outputStream);
        } catch (IOException ioe) {
            if (outputStream != null) {
                outputStream.close();
            }
            throw ioe;
        }
        // mark as initialized
        writePrepared = true;
    }

    /**
     * Write internal row using this writer.
     * Row should confirm to the type description. When the current stripe reaches
     * `numRowsInStripe` records, it is flushed to the output stream and a fresh stripe with
     * fresh statistics and column filters is started.
     * @param row internal row
     * @throws IOException if flushing or writing the row fails; stream is closed on error
     */
    public void write(InternalRow row) throws IOException {
        try {
            if (stripeCurrentRecords == 0) {
                // flush data into stripe buffer
                stripeStream.flush();
                // write stripe information into output, such as stripe id and length, and capture position
                StripeInformation stripeInfo = new StripeInformation(stripe, currentOffset, stripeStats,
                        stripeFilters);
                currentOffset += stripeInfo.length();
                // written numRowsInStripe records
                totalRecords += numRowsInStripe;
                stripe.flush(outputStream);
                LOG.debug("Finished writing stripe {}, records={}", stripeInfo, numRowsInStripe);
                stripes.add(stripeInfo);
                // release the finished stream; close() after flush() emits no further data
                // (finishWrite relies on the same ordering)
                stripeStream.close();
                stripe = new StripeOutputBuffer(stripeId++);
                stripeStream = new OutStream(bufferSize, codec, stripe);
                stripeCurrentRecords = numRowsInStripe;
                stripeStats = createStatistics(td);
                // BUGFIX: filters must be recreated per stripe, same as statistics; otherwise
                // every stripe after the first keeps accumulating values into the first stripe's
                // filters, attaching over-broad filters to later stripes
                stripeFilters = createColumnFilters(td, columnFilterEnabled, numRowsInStripe);
            }
            updateStatistics(stripeStats, td, row);
            updateColumnFilters(stripeFilters, td, row);
            recordWriter.writeRow(row, stripeStream);
            stripeCurrentRecords--;
        } catch (IOException ioe) {
            if (outputStream != null) {
                outputStream.close();
            }
            throw ioe;
        }
    }

    /**
     * Finish writes.
     * Flushes the last (possibly partial) stripe, merges per-stripe statistics into file
     * statistics, writes the file footer, and closes all resources. Subsequent calls are no-ops.
     * @throws IOException if writer was not prepared or flushing/footer write fails
     */
    public void finishWrite() throws IOException {
        if (!writePrepared)
            throw new IOException("Writer is not prepared");
        if (writeFinished)
            return;
        try {
            // flush the last stripe into output stream
            stripeStream.flush();
            StripeInformation stripeInfo = new StripeInformation(stripe, currentOffset, stripeStats, stripeFilters);
            stripe.flush(outputStream);
            LOG.debug("Finished writing stripe {}, records={}", stripeInfo, numRowsInStripe - stripeCurrentRecords);
            // update total records with delta
            totalRecords += numRowsInStripe - stripeCurrentRecords;
            stripes.add(stripeInfo);
            stripeStream.close();
            stripe = null;
            stripeStream = null;
            stripeStats = null;
            stripeFilters = null;

            LOG.debug("Merge stripe statistics");
            // combine all statistics for a file
            Statistics[] fileStats = createStatistics(td);
            for (StripeInformation info : stripes) {
                if (info.hasStatistics()) {
                    for (int i = 0; i < fileStats.length; i++) {
                        fileStats[i].merge(info.getStatistics()[i]);
                    }
                }
            }
            LOG.debug("Write file footer");
            FileFooter fileFooter = FileFooter.create(fileStats, totalRecords, stripes);
            fileFooter.writeTo(outputStream);
        } finally {
            // close file stream
            if (outputStream != null) {
                outputStream.close();
            }
            // release codec resources
            if (codec != null) {
                codec.close();
            }
        }
        LOG.info("Finished writing file {}", filePath);
        writeFinished = true;
    }

    /**
     * Create new array of statistics for a stripe, one instance per field.
     * @param td type description
     * @return statistics
     */
    private static Statistics[] createStatistics(TypeDescription td) {
        Statistics[] stats = new Statistics[td.fields().length];
        for (int i = 0; i < td.fields().length; i++) {
            stats[i] = Statistics.sqlTypeToStatistics(td.fields()[i].dataType());
        }
        return stats;
    }

    /**
     * Update each instance of statistics with value of internal row.
     * @param stats list of statistics
     * @param td type description for a row
     * @param row row to use for updates, should match type description
     */
    private static void updateStatistics(Statistics[] stats, TypeDescription td, InternalRow row) {
        for (int i = 0; i < stats.length; i++) {
            // ordinal is original sql position in struct type for internal row, not index of type spec
            stats[i].update(row, td.atPosition(i).origSQLPos());
        }
    }

    /**
     * Create new array of column filters for a stripe.
     * Returns null if column filters are disabled.
     * @param td type description
     * @param enabled true if column filters are enabled
     * @param stripeRows expected number of rows in stripe
     * @return array of column filters or null if filters are disabled
     */
    private static ColumnFilter[] createColumnFilters(TypeDescription td, boolean enabled, int stripeRows) {
        // column filters are created only if enabled and type description contains any indexed fields
        if (!enabled || td.indexFields().length == 0)
            return null;
        ColumnFilter[] filters = new ColumnFilter[td.fields().length];
        for (int i = 0; i < td.fields().length; i++) {
            if (td.fields()[i].isIndexed()) {
                filters[i] = ColumnFilter.sqlTypeToColumnFilter(td.fields()[i].dataType(), stripeRows);
            } else {
                // non-indexed columns get a pass-through filter so indices stay aligned
                filters[i] = ColumnFilter.noopFilter();
            }
        }
        return filters;
    }

    /**
     * Update each instance of column filters with value of internal row.
     * @param filters array of filters, can be null
     * @param td type description
     * @param row row to use for updates
     */
    private static void updateColumnFilters(ColumnFilter[] filters, TypeDescription td, InternalRow row) {
        if (filters == null)
            return;
        for (int i = 0; i < filters.length; i++) {
            filters[i].update(row, td.atPosition(i).origSQLPos());
        }
    }

    @Override
    public String toString() {
        return "FileWriter[" + "path=" + filePath + ", type_desc=" + td + ", rows_per_stripe=" + numRowsInStripe
                + ", is_compressed=" + (codec != null) + ", buffer_size=" + bufferSize + ", hdfs_buffer_size="
                + hdfsBufferSize + ", column_filter_enabled=" + columnFilterEnabled + "]";
    }
}