Source code

Java tutorial


Here is the source code for org.apache.samza.system.hdfs.reader.AvroFileHdfsReader:


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.samza.system.hdfs.reader;


import java.io.IOException;

import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.AvroFSInput;
import org.apache.hadoop.fs.FileContext;
import org.apache.hadoop.fs.Path;
import org.apache.samza.SamzaException;
import org.apache.samza.system.IncomingMessageEnvelope;
import org.apache.samza.system.SystemStreamPartition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

 * An implementation of the HdfsReader that reads and processes avro format
 * files.
public class AvroFileHdfsReader implements SingleFileHdfsReader {

    private static final Logger LOG = LoggerFactory.getLogger(AvroFileHdfsReader.class);

    private final SystemStreamPartition systemStreamPartition;
    private DataFileReader<GenericRecord> fileReader;
    private long curBlockStart;
    private long curRecordOffset;

    public AvroFileHdfsReader(SystemStreamPartition systemStreamPartition) {
        this.systemStreamPartition = systemStreamPartition;
        this.fileReader = null;

    public void open(String pathStr, String singleFileOffset) {"%s: Open file [%s] with file offset [%s] for read", systemStreamPartition, pathStr,
        Path path = new Path(pathStr);
        try {
            AvroFSInput input = new AvroFSInput(FileContext.getFileContext(path.toUri()), path);
            fileReader = new DataFileReader<>(input, new GenericDatumReader<>());
        } catch (IOException e) {
            throw new SamzaException(e);

    public void seek(String singleFileOffset) {
        try {
            // See comments for AvroFileCheckpoint to understand the behavior below
            AvroFileCheckpoint checkpoint = new AvroFileCheckpoint(singleFileOffset);
            if (checkpoint.isStartingOffset()) {
                // seek to the beginning of the first block
                curBlockStart = fileReader.previousSync();
                curRecordOffset = 0;
            for (int i = 0; i < checkpoint.getRecordOffset(); i++) {
                if (fileReader.hasNext()) {
            curBlockStart = checkpoint.getBlockStart();
            curRecordOffset = checkpoint.getRecordOffset();
        } catch (IOException e) {
            throw new SamzaException(e);

    public IncomingMessageEnvelope readNext() {
        // get checkpoint for THIS record
        String checkpoint = nextOffset();
        GenericRecord record =;
        if (fileReader.previousSync() != curBlockStart) {
            curBlockStart = fileReader.previousSync();
            curRecordOffset = 0;
        } else {
        // avro schema doesn't necessarily have key field
        return new IncomingMessageEnvelope(systemStreamPartition, checkpoint, null, record);

    public boolean hasNext() {
        return fileReader.hasNext();

    public void close() {"About to close file reader for " + systemStreamPartition);
        try {
        } catch (IOException e) {
            throw new SamzaException(e);
        }"File reader closed for " + systemStreamPartition);

    public String nextOffset() {
        return AvroFileCheckpoint.generateCheckpointStr(curBlockStart, curRecordOffset);

    public static int offsetComparator(String offset1, String offset2) {
        AvroFileCheckpoint cp1 = new AvroFileCheckpoint(offset1);
        AvroFileCheckpoint cp2 = new AvroFileCheckpoint(offset2);
        return cp1.compareTo(cp2);

     * An avro file looks something like this:
     * Byte offset: 0       103            271         391
     *              ?
     * Avro file:    Header     Block 1     Block 2    Block 3   ...
     * Each block contains multiple records. The start of a block is defined as a valid
     * synchronization point. A file reader can only seek to a synchronization point, i.e.
     * the start of blocks. Thus, to precisely describe the location of a record, we need
     * to use the pair (blockStart, recordOffset). Here "blockStart" means the start of the
     * block and "recordOffset" means the index of the record within the block.
     * Take the example above, and suppose block 1 has 4 records, we have record sequences as:
     * (103, 0), (103, 1), (103, 2), (103, 3), (271, 0), ...
     * where (271, 0) represents the first event in block 2
     * With the CP_DELIM being '@', the actual checkpoint string would look like "103@1",
     * "271@0" or "271", etc. For convenience, a checkpoint with only the blockStart but no
     * recordOffset within the block simply means the first record in that block. Thus,
     * "271@0" is equal to "271".
    public static class AvroFileCheckpoint {
        private static final String CP_DELIM = "@";
        private long blockStart; // start position of the block
        private long recordOffset; // record offset within the block
        String checkpointStr;

        public static String generateCheckpointStr(long blockStart, long recordOffset) {
            return blockStart + CP_DELIM + recordOffset;

        public AvroFileCheckpoint(String checkpointStr) {
            String[] elements = checkpointStr.replaceAll("\\s", "").split(CP_DELIM);
            if (elements.length > 2 || elements.length < 1) {
                throw new SamzaException("Invalid checkpoint for AvroFileHdfsReader: " + checkpointStr);
            try {
                blockStart = Long.parseLong(elements[0]);
                recordOffset = elements.length == 2 ? Long.parseLong(elements[1]) : 0;
            } catch (NumberFormatException e) {
                throw new SamzaException("Invalid checkpoint for AvroFileHdfsReader: " + checkpointStr, e);
            this.checkpointStr = checkpointStr;

        public AvroFileCheckpoint(long blockStart, long recordOffset) {
            this.blockStart = blockStart;
            this.recordOffset = recordOffset;
            this.checkpointStr = generateCheckpointStr(blockStart, recordOffset);

        public long getBlockStart() {
            return blockStart;

        public long getRecordOffset() {
            return recordOffset;

        public String getCheckpointStr() {
            return checkpointStr;

        public boolean isStartingOffset() {
            return blockStart == 0;

        public int compareTo(AvroFileCheckpoint other) {
            if (this.blockStart < other.blockStart) {
                return -1;
            } else if (this.blockStart > other.blockStart) {
                return 1;
            } else
                return, other.recordOffset);

        public String toString() {
            return getCheckpointStr();