com.facebook.hive.orc.ReaderImpl.java Source code

Java tutorial

Introduction

Here is the source code for com.facebook.hive.orc.ReaderImpl.java

Source

//  Copyright (c) 2013, Facebook, Inc.  All rights reserved.

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.facebook.hive.orc;

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import com.facebook.hive.orc.compression.CompressionCodec;
import com.facebook.hive.orc.compression.CompressionKind;
import com.facebook.hive.orc.statistics.ColumnStatistics;
import com.facebook.hive.orc.statistics.ColumnStatisticsImpl;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;

import com.facebook.hive.orc.lazy.OrcLazyRowObjectInspector;
import com.google.protobuf.CodedInputStream;
import org.apache.hadoop.io.IOUtils;

/**
 * {@link Reader} implementation that parses the postscript and footer found at the end of a file
 * and exposes the footer's metadata (stripes, types, statistics, user metadata), as well as
 * factories for row and stripe readers over the same file.
 *
 * All state is captured in final fields by the constructor; the accessors only read from the
 * parsed footer.
 */
public final class ReaderImpl implements Reader {
    private static final Log LOG = LogFactory.getLog(ReaderImpl.class);

    /** Upper bound on the number of bytes fetched from the tail of the file to find the postscript. */
    private static final int DIRECTORY_SIZE_GUESS = 16 * 1024;

    private final FileSystem fileSystem;
    private final Path path;
    private final Configuration conf;
    private final CompressionKind compressionKind;
    private final CompressionCodec codec;
    private final int bufferSize;
    private final OrcProto.Footer footer;
    private final ObjectInspector inspector;

    /**
     * Read-only view over a protobuf {@link OrcProto.StripeInformation} message; every accessor
     * delegates straight to the wrapped message.
     */
    private static class StripeInformationImpl implements StripeInformation {
        private final OrcProto.StripeInformation stripe;

        StripeInformationImpl(OrcProto.StripeInformation stripe) {
            this.stripe = stripe;
        }

        @Override
        public long getOffset() {
            return stripe.getOffset();
        }

        @Override
        public long getDataLength() {
            return stripe.getDataLength();
        }

        @Override
        public long getFooterLength() {
            return stripe.getFooterLength();
        }

        @Override
        public long getIndexLength() {
            return stripe.getIndexLength();
        }

        @Override
        public long getNumberOfRows() {
            return stripe.getNumberOfRows();
        }

        @Override
        public long getRawDataSize() {
            return stripe.getRawDataSize();
        }

        /** Renders all stripe fields on one line, for logging and debugging. */
        @Override
        public String toString() {
            return "offset: " + getOffset() + " data: " + getDataLength() + " rows: " + getNumberOfRows()
                    + " tail: " + getFooterLength() + " index: " + getIndexLength() + " raw_data: "
                    + getRawDataSize();
        }
    }

    /** Returns the row count recorded in the file footer. */
    @Override
    public long getNumberOfRows() {
        return footer.getNumberOfRows();
    }

    /** Returns the raw data size recorded in the file footer. */
    @Override
    public long getRawDataSize() {
        return footer.getRawDataSize();
    }

    /**
     * Returns the names of all user metadata items stored in the footer, in footer order.
     * The result is a fresh list, so callers may modify it freely.
     */
    @Override
    public Iterable<String> getMetadataKeys() {
        List<OrcProto.UserMetadataItem> metadata = footer.getMetadataList();
        List<String> result = new ArrayList<String>(metadata.size());

        for (OrcProto.UserMetadataItem item : metadata) {
            result.add(item.getName());
        }
        return result;
    }

    /**
     * Looks up a user metadata value by name.
     *
     * @param key name of the metadata item
     * @return a read-only buffer over the value of the first item whose name equals {@code key}
     * @throws IllegalArgumentException if the footer has no item with that name
     */
    @Override
    public ByteBuffer getMetadataValue(String key) {
        for (OrcProto.UserMetadataItem item : footer.getMetadataList()) {
            if (item.hasName() && item.getName().equals(key)) {
                return item.getValue().asReadOnlyByteBuffer();
            }
        }
        throw new IllegalArgumentException("Can't find user metadata " + key);
    }

    /** Returns the compression kind declared in the file's postscript. */
    @Override
    public CompressionKind getCompression() {
        return compressionKind;
    }

    /** Returns the compression block size declared in the file's postscript. */
    @Override
    public int getCompressionSize() {
        return bufferSize;
    }

    /**
     * Returns a lazily-wrapping view over the footer's stripe list. Each call to
     * {@code iterator()} starts a new pass, and each element is wrapped on demand.
     * The iterators do not support {@code remove()}.
     */
    @Override
    public Iterable<StripeInformation> getStripes() {
        return new Iterable<com.facebook.hive.orc.StripeInformation>() {

            @Override
            public Iterator<com.facebook.hive.orc.StripeInformation> iterator() {
                return new Iterator<com.facebook.hive.orc.StripeInformation>() {
                    private final Iterator<OrcProto.StripeInformation> inner = footer.getStripesList().iterator();

                    @Override
                    public boolean hasNext() {
                        return inner.hasNext();
                    }

                    @Override
                    public com.facebook.hive.orc.StripeInformation next() {
                        return new StripeInformationImpl(inner.next());
                    }

                    @Override
                    public void remove() {
                        throw new UnsupportedOperationException("remove unsupported");
                    }
                };
            }
        };
    }

    /** Returns the row object inspector built from the footer's type list. */
    @Override
    public ObjectInspector getObjectInspector() {
        return inspector;
    }

    /** Returns the content length recorded in the file footer. */
    @Override
    public long getContentLength() {
        return footer.getContentLength();
    }

    /** Returns the type list recorded in the file footer. */
    @Override
    public List<OrcProto.Type> getTypes() {
        return footer.getTypesList();
    }

    /** Returns the row index stride recorded in the file footer. */
    @Override
    public int getRowIndexStride() {
        return footer.getRowIndexStride();
    }

    /**
     * Deserializes the file-level column statistics from the footer.
     *
     * @return a new array with one entry per footer type, indexed like the type list
     */
    @Override
    public ColumnStatistics[] getStatistics() {
        ColumnStatistics[] result = new ColumnStatistics[footer.getTypesCount()];
        for (int i = 0; i < result.length; ++i) {
            result[i] = ColumnStatisticsImpl.deserialize(footer.getStatistics(i));
        }
        return result;
    }

    /**
     * Opens {@code path}, parses its postscript and footer, and closes the file again.
     *
     * The last byte of the file holds the postscript length; the postscript sits immediately
     * before it and the footer immediately before the postscript.
     *
     * @param fs file system used to open the file
     * @param path file to read
     * @param conf configuration handed on to the record readers created by {@link #rows}
     * @throws NotAnORCFileException if parsing ran out of bounds and the file does not start with "ORC"
     * @throws IOException if the file cannot be read or its tail cannot be parsed
     * @throws IllegalArgumentException if the postscript names an unknown compression kind
     */
    public ReaderImpl(FileSystem fs, Path path, Configuration conf) throws IOException {
        try {
            this.fileSystem = fs;
            this.path = path;
            this.conf = conf;
            FSDataInputStream file = fs.open(path);
            try {
                long size = fs.getFileStatus(path).getLen();
                int readSize = (int) Math.min(size, DIRECTORY_SIZE_GUESS);
                ByteBuffer buffer = ByteBuffer.allocate(readSize);
                InStream.read(file, size - readSize, buffer.array(), buffer.arrayOffset() + buffer.position(),
                        buffer.remaining());
                // The length is stored as an unsigned byte; mask it so that postscripts longer than
                // 127 bytes do not turn into a negative length.
                int psLen = buffer.get(readSize - 1) & 0xff;
                int psOffset = readSize - 1 - psLen;
                CodedInputStream in = CodedInputStream.newInstance(buffer.array(), buffer.arrayOffset() + psOffset,
                        psLen);
                OrcProto.PostScript ps = OrcProto.PostScript.parseFrom(in);
                int footerSize = (int) ps.getFooterLength();
                bufferSize = (int) ps.getCompressionBlockSize();
                switch (ps.getCompression()) {
                case NONE:
                    compressionKind = CompressionKind.NONE;
                    break;
                case ZLIB:
                    compressionKind = CompressionKind.ZLIB;
                    break;
                case SNAPPY:
                    compressionKind = CompressionKind.SNAPPY;
                    break;
                case LZO:
                    compressionKind = CompressionKind.LZO;
                    break;
                default:
                    throw new IllegalArgumentException("Unknown compression");
                }
                codec = WriterImpl.createCodec(compressionKind);

                InputStream instream = InStream.create("footer", file, size - 1 - psLen - footerSize, footerSize,
                        codec, bufferSize);
                footer = OrcProto.Footer.parseFrom(instream);
                inspector = new OrcLazyRowObjectInspector(0, footer.getTypesList());
            } finally {
                // Release the handle on every path, including parse failures. The stream was only
                // read from, so a failure while closing carries no information worth surfacing.
                IOUtils.closeStream(file);
            }
        } catch (IndexOutOfBoundsException e) {
            /**
             * Reading a non-ORC file with the ORC reader surfaces as an IndexOutOfBoundsException
             * while the reader is being created. In that case inspect the file header: if the file
             * is not ORC, checkIfORC throws a NotAnORCFileException naming the file (which helps to
             * identify the table partition being read); otherwise fall through and report the
             * parse failure itself.
             */
            checkIfORC(fs, path);
            throw new IOException("Failed to create record reader for file " + path, e);
        } catch (IOException e) {
            throw new IOException("Failed to create record reader for file " + path, e);
        }
    }

    /**
     * Reads the file header (at most the first 40 bytes) and checks whether it starts with "ORC".
     *
     * If it does, an error is logged and the method returns normally, leaving it to the caller to
     * report the footer parse failure. Otherwise a {@link NotAnORCFileException} is thrown whose
     * message contains the path and the header bytes that were actually read.
     *
     * @param fs file system used to open the file
     * @param path file to inspect
     * @throws NotAnORCFileException if the file does not start with "ORC"
     * @throws IOException if the header cannot be read
     */
    public static void checkIfORC(FileSystem fs, Path path) throws IOException {
        // hardcoded to 40 because "SEQ-org.apache.hadoop.hive.ql.io.RCFile", the header, is of 40 chars
        final int buffLen = 40;
        final byte[] header = new byte[buffLen];
        final FSDataInputStream file = fs.open(path);
        int sizeToBeRead = buffLen;
        try {
            final long fileLength = fs.getFileStatus(path).getLen();
            if (buffLen > fileLength) {
                sizeToBeRead = (int) fileLength;
            }
            IOUtils.readFully(file, header, 0, sizeToBeRead);
        } finally {
            // Close on every path so that a failed read does not leak the handle.
            IOUtils.closeStream(file);
        }

        // Decode only the bytes that were read, with a fixed charset, so that short files do not
        // produce trailing NUL characters and the result does not depend on the platform default.
        final String headerString = new String(header, 0, sizeToBeRead, "UTF-8");
        if (headerString.startsWith("ORC")) {
            LOG.error("Error while parsing the footer of the file : " + path);
        } else {
            throw new NotAnORCFileException("Input file = " + path + " , header = " + headerString);
        }
    }

    /**
     * Creates a record reader over the whole file.
     *
     * @param include passed through unchanged to {@link #rows(long, long, boolean[])}
     */
    @Override
    public RecordReader rows(boolean[] include) throws IOException {
        return rows(0, Long.MAX_VALUE, include);
    }

    /**
     * Creates a record reader for the given byte range of the file, using the codec, buffer size,
     * types and row index stride read from the file tail.
     *
     * @param offset passed through to {@link RecordReaderImpl}
     * @param length passed through to {@link RecordReaderImpl}
     * @param include passed through to {@link RecordReaderImpl}
     */
    @Override
    public RecordReader rows(long offset, long length, boolean[] include) throws IOException {
        return new RecordReaderImpl(this.getStripes(), fileSystem, path, offset, length, footer.getTypesList(),
                codec, bufferSize, include, footer.getRowIndexStride(), conf);
    }

    /**
     * Creates a stripe reader for the given byte range of the file.
     *
     * @param offset passed through to {@link StripeReader}
     * @param length passed through to {@link StripeReader}
     */
    @Override
    public StripeReader stripes(long offset, long length) throws IOException {
        return new StripeReader(this.getStripes(), fileSystem, path, offset, length);
    }

}