com.blackberry.logdriver.mapred.avro.AvroBlockRecordReader.java Source code

Java tutorial

Introduction

Here is the source code for com.blackberry.logdriver.mapred.avro.AvroBlockRecordReader.java

Source

/** Copyright (c) 2014 BlackBerry Limited
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License. 
 */

package com.blackberry.logdriver.mapred.avro;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
import java.util.Iterator;

import org.apache.avro.file.DataFileConstants;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.lib.CombineFileSplit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.blackberry.logdriver.avro.AvroFileHeader;
import com.blackberry.logdriver.avro.AvroUtils;

public class AvroBlockRecordReader implements RecordReader<AvroFileHeader, BytesWritable> {
    private static final Logger LOG = LoggerFactory.getLogger(AvroBlockRecordReader.class);

    @SuppressWarnings("unused")
    private static final BytesWritable EMPTY_BYTES = new BytesWritable();

    private FileSystem fs = null;
    private FSDataInputStream in = null;

    private CombineFileSplit split = null;
    private int currentFile = -1;

    private long start = 0;
    private long end = 0;
    private long pos = 0;

    private AvroFileHeader header = null;

    public AvroBlockRecordReader(InputSplit split, JobConf job) throws IOException {
        this.split = (CombineFileSplit) split;
        fs = FileSystem.get(job);
    }

    private void advanceToSyncMarker(FSDataInputStream in, byte[] syncMarker) throws IOException {
        byte b = 0;
        int bytesRead = 0;
        byte[] sync = header.getSyncMarker();
        Iterator<Byte> iterator = null;
        boolean match = true;

        Deque<Byte> deque = new ArrayDeque<Byte>(DataFileConstants.SYNC_SIZE);
        while (true) {
            b = in.readByte();
            deque.add(b);
            bytesRead++;

            match = true;
            if (deque.size() == DataFileConstants.SYNC_SIZE) {
                match = true;
                iterator = deque.iterator();
                for (int i = 0; i < DataFileConstants.SYNC_SIZE; i++) {
                    if (sync[i] != iterator.next()) {
                        match = false;
                        break;
                    }
                }

                if (match) {
                    break;
                }

                deque.remove();
            }
        }

        pos = start + bytesRead;
        LOG.info("Found sync marker at {}", pos - 16);
        in.seek(pos);
    }

    private void initCurrentFile() throws IOException {
        LOG.info("Initializing {}:{}+{}", new Object[] { split.getPath(currentFile), split.getOffset(currentFile),
                split.getLength(currentFile) });
        start = split.getOffset(currentFile);
        end = start + split.getLength(currentFile);

        // Open the file.
        in = fs.open(split.getPath(currentFile));

        // Read the header, validate it, and save it.
        // If this is a zero length split, then don't try to read the header. The
        // bext call to next() will just cause it to skip to the next file, no harm
        // done.
        if (start == end) {
            header = new AvroFileHeader();
            pos = start;
        } else {
            header = AvroFileHeader.readHeader(in);

            // Seek to the start of the split
            in.seek(start);

            // Seek to the next sync marker
            advanceToSyncMarker(in, header.getSyncMarker());

            pos = in.getPos();
        }
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }

    @Override
    public float getProgress() throws IOException {
        // The fraction of files read, plus where we are within that file
        long currentPosition = 0;
        for (int i = 0; i < currentFile && i < split.getNumPaths(); i++) {
            currentPosition += split.getLength(i);
        }
        if (currentFile < split.getNumPaths()) {
            currentPosition += getPos() - split.getOffset(currentFile);
        }

        return (float) (1.0 * currentPosition / split.getLength());
    }

    @Override
    public boolean next(AvroFileHeader key, BytesWritable value) throws IOException {
        while (pos >= end) {
            if (in != null) {
                in.close();
            }
            currentFile++;
            if (split.getNumPaths() > currentFile) {
                try {
                    initCurrentFile();
                } catch (IOException e) {
                    LOG.error("Error reading Boom file header.  Skipping this file.", e);
                    return false;
                }
            } else {
                return false;
            }
        }

        key.set(header);

        // Get the number of entries in the next block
        int entries = AvroUtils.readInt(in);
        byte[] block = null;
        try {
            block = AvroUtils.readBytes(in);
        } catch (IOException e) {
            LOG.error("Error reading block from file.  Skipping the rest of this file.", e);
            return false;
        }

        // Check that the sync marker is what we expect
        LOG.trace("Verifying sync marker");
        byte[] syncMarker = null;
        try {
            syncMarker = AvroUtils.readBytes(in, DataFileConstants.SYNC_SIZE);
        } catch (IOException e) {
            LOG.error("Error reading sync marker from file.  Skipping the rest of this file.", e);
            return false;
        }
        if (!Arrays.equals(syncMarker, header.getSyncMarker())) {
            LOG.error("Sync marker does not match");
            return false;
        }

        // Now, pack it all back into a byte[], and set the value of value
        {
            ByteBuffer bb = ByteBuffer.allocate(10 + 10 + block.length);
            bb.put(AvroUtils.encodeLong(entries));
            bb.put(AvroUtils.encodeLong(block.length));
            bb.put(block);
            byte[] result = new byte[bb.position()];
            bb.rewind();
            bb.get(result);
            value.set(result, 0, result.length);

            pos = in.getPos();
        }

        return true;
    }

    @Override
    public AvroFileHeader createKey() {
        return new AvroFileHeader();
    }

    @Override
    public BytesWritable createValue() {
        return new BytesWritable();
    }

    @Override
    public long getPos() throws IOException {
        return pos;
    }
}