com.ashishpaliwal.hadoop.utils.inputformat.CsvLineReader.java Source code

Java tutorial

Introduction

Here is the source code for com.ashishpaliwal.hadoop.utils.inputformat.CsvLineReader.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package com.ashishpaliwal.hadoop.utils.inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;

import java.io.IOException;
import java.io.InputStream;

/**
 * CSV Line reader
 */
public class CsvLineReader {

    // Deafult buffer size as 32K
    private static final int DEFAULT_BUFFER_SIZE = 32 * 1024;
    private int bufferSize = DEFAULT_BUFFER_SIZE;
    private InputStream in;
    private byte[] buffer;
    private int bufferLength = 0;
    private int bufferPosn = 0;

    /**
     * Create a csv line reader that reads from the given stream using the
     * default buffer-size (64k).
     *
     * @param in The input stream
     */
    public CsvLineReader(InputStream in) {
        this(in, DEFAULT_BUFFER_SIZE);
    }

    /**
     * Create a csv line reader that reads from the given stream using the
     * given buffer-size.
     *
     * @param in         The input stream
     * @param bufferSize Size of the read buffer
     */
    public CsvLineReader(InputStream in, int bufferSize) {
        this.in = in;
        this.bufferSize = bufferSize;
        this.buffer = new byte[this.bufferSize];
    }

    /**
     * Create a line reader that reads from the given stream using the
     * <code>io.file.buffer.size</code> specified in the given
     * <code>Configuration</code>.
     *
     * @param in   input stream
     * @param conf configuration
     * @throws IOException
     */
    public CsvLineReader(InputStream in, Configuration conf) throws IOException {
        this(in, conf.getInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE));
    }

    /**
     * Fill the buffer with more data.
     *
     * @return was there more data?
     * @throws IOException
     */
    boolean backfill() throws IOException {
        bufferPosn = 0;
        bufferLength = in.read(buffer);
        return bufferLength > 0;
    }

    /**
     * Close the underlying stream.
     *
     * @throws IOException
     */
    public void close() throws IOException {
        in.close();
    }

    /**
     * Read from the InputStream into the given Text.
     *
     * @param txt               the object to store the given line
     * @param maxLineLength     the maximum number of bytes to store into txt.
     * @param maxBytesToConsume the maximum number of bytes to consume in this
     *                          call.
     * @return the number of bytes read including the newline
     * @throws IOException if the underlying stream throws
     */
    public int readLine(Text txt, int maxLineLength, int maxBytesToConsume) throws IOException {
        txt.clear();
        boolean hadFinalNewline = false;
        boolean hadFinalReturn = false;
        boolean hitEndOfFile = false;
        int startPosn = bufferPosn;
        long bytesConsumed = 0;
        boolean inQuote = false;
        boolean isLastCharEscapeChar = false;

        outerLoop: while (true) {
            if (bufferPosn >= bufferLength) {
                if (!backfill()) {
                    hitEndOfFile = true;
                    break;
                }
            }

            startPosn = bufferPosn;

            for (; bufferPosn < bufferLength; ++bufferPosn) {

                switch (buffer[bufferPosn]) {

                case '\\':
                    isLastCharEscapeChar = !isLastCharEscapeChar;
                    break;

                case '"':
                    if (!inQuote && hadFinalReturn) {
                        break outerLoop;
                    }

                    if (!isLastCharEscapeChar) {
                        inQuote = !inQuote;
                    }
                    isLastCharEscapeChar = false;
                    break;

                case '\n':
                    isLastCharEscapeChar = false;
                    if (!inQuote) {
                        hadFinalNewline = true;
                        bufferPosn += 1;
                        break outerLoop;
                    }
                    break;

                case '\r':
                    isLastCharEscapeChar = false;
                    if (!inQuote) {
                        if (hadFinalReturn) {
                            // leave this \r in the stream, so we'll get it next time
                            break outerLoop;
                        }
                        hadFinalReturn = true;
                    }
                    break;

                default:
                    isLastCharEscapeChar = false;
                    if (!inQuote && hadFinalReturn) {
                        break outerLoop;
                    }
                }
            }

            bytesConsumed += bufferPosn - startPosn;
            int length = bufferPosn - startPosn - (hadFinalReturn ? 1 : 0);
            length = Math.min(length, maxLineLength - txt.getLength());

            if (length >= 0)
                txt.append(buffer, startPosn, length);

            if (bytesConsumed >= maxBytesToConsume)
                return (int) Math.min(bytesConsumed, (long) Integer.MAX_VALUE);
        }

        int newlineLength = (hadFinalNewline ? 1 : 0) + (hadFinalReturn ? 1 : 0);

        if (!hitEndOfFile) {
            bytesConsumed += bufferPosn - startPosn;
            int length = bufferPosn - startPosn - newlineLength;
            length = Math.min(length, maxLineLength - txt.getLength());

            if (length > 0)
                txt.append(buffer, startPosn, length);
        }
        return (int) Math.min(bytesConsumed, (long) Integer.MAX_VALUE);
    }

    /**
     * Read from the InputStream into the given Text.
     *
     * @param txt           the object to store the given line
     * @param maxLineLength the maximum number of bytes to store into txt.
     * @return the number of bytes read including the newline
     * @throws IOException if the underlying txteam throws
     */
    public int readLine(Text txt, int maxLineLength) throws IOException {
        return readLine(txt, maxLineLength, Integer.MAX_VALUE);
    }

    /**
     * Read from the InputStream into the given Text.
     *
     * @param txt the object to store the given line
     * @return the number of bytes read including the newline
     * @throws IOException if the underlying stream throws
     */
    public int readLine(Text txt) throws IOException {
        return readLine(txt, Integer.MAX_VALUE, Integer.MAX_VALUE);
    }
}