org.ojai.json.mapreduce.JSONFileRecordReader.java Source code

Introduction

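The following is the complete source code for org.ojai.json.mapreduce.JSONFileRecordReader.java, a Hadoop MapReduce RecordReader that reads OJAI documents from a split of a JSON file.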
Source

/**
 * Copyright (c) 2015 MapR, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.ojai.json.mapreduce;

import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.ojai.Document;
import org.ojai.DocumentStream;
import org.ojai.json.Json;

/**
 * A Hadoop {@link RecordReader} that reads OJAI {@link Document}s from one
 * {@link FileSplit} of a JSON file.
 *
 * <p>The key of each record is the zero-based ordinal of the document within
 * this reader (not a byte offset); the value is the parsed document.
 *
 * <p>When a split does not start at the beginning of the file, the reader scans
 * forward from the byte preceding the split to a position that looks like the
 * start of a top-level document (a <code>'{'</code> that follows a
 * <code>'}'</code> with no <code>','</code> in between) and starts parsing there.
 * This is a heuristic: it does not understand JSON string literals, so braces
 * or commas inside strings can mislead it.
 */
public class JSONFileRecordReader extends RecordReader<LongWritable, Document> {

    /** Size of the scratch buffer used while scanning for a document boundary. */
    private static final int SCAN_BUFFER_SIZE = 4096;

    private FSDataInputStream inputStream;
    private DocumentStream<Document> documentStream;
    private Iterator<Document> it;
    /** Number of documents returned so far; also used as the next record key. */
    private long documentCount;
    private LongWritable key = null;
    private Document document;
    /** Last observed byte position of {@link #inputStream}; drives {@link #getProgress()}. */
    private long currentPos;
    /** Byte offset of the first byte of the split. */
    private long start;
    /** Byte offset one past the last byte of the split ({@code start + blockLength}). */
    private long end;
    /** Length of the split in bytes. */
    private long blockLength;

    /**
     * Closes the document stream and the underlying file stream.
     *
     * <p>Safe to call when {@link #initialize} was never called or failed part
     * way through. The file stream is closed even if closing the document
     * stream fails.
     *
     * @throws IOException if either stream fails to close; a failure from the
     *         document stream is attached as the cause
     */
    @Override
    public void close() throws IOException {
        try {
            if (documentStream != null) {
                documentStream.close();
            }
        } catch (Exception e) {
            // Keep the original message but preserve the cause for diagnosis.
            throw new IOException("Error closing document Stream in JsonFileRecordReader", e);
        } finally {
            if (inputStream != null) {
                inputStream.close();
            }
        }
    }

    /**
     * @return the key of the current record (the ordinal of the current
     *         document), or {@code null} before {@link #initialize} is called
     */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * @return the document read by the last successful {@link #nextKeyValue()},
     *         or {@code null} if none has been read yet
     */
    @Override
    public Document getCurrentValue() throws IOException, InterruptedException {
        return document;
    }

    /**
     * Reports how far the underlying stream has advanced through the split.
     *
     * @return a value in {@code [0.0, 1.0]}; {@code 0.0} for an empty split
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0F;
        }
        final float fraction = (float) (currentPos - start) / (float) (end - start);
        // currentPos can be start - 1 right after initialize(), so clamp below too.
        return Math.max(0.0F, Math.min(1.0F, fraction));
    }

    /**
     * Scans forward from byte {@code start - 1} looking for the start of the
     * next document and returns how many bytes precede it.
     *
     * <p>The scan works on raw bytes rather than decoded characters so that the
     * result is a true byte offset usable with {@code seek}. This is safe for
     * UTF-8 input because the three bytes of interest are ASCII and ASCII byte
     * values never occur inside a multi-byte UTF-8 sequence.
     *
     * @param start       byte offset of the split; must be greater than zero
     * @param blockLength length of the split in bytes
     * @return the number of bytes between {@code start - 1} and the detected
     *         document start, or {@code blockLength + 1} if no document start
     *         was found within the split (including when end of file is hit)
     * @throws IOException if seeking or reading the stream fails
     */
    private long bytesToSkip(long start, long blockLength) throws IOException {
        inputStream.seek(start - 1);

        final byte[] buffer = new byte[SCAN_BUFFER_SIZE];
        boolean sawDocumentEnd = false;
        long toSkip = 0;

        // Examine at most blockLength + 1 bytes: the byte before the split plus the split itself.
        while (toSkip <= blockLength) {
            final int wanted = (int) Math.min((long) buffer.length, blockLength - toSkip + 1);
            final int got = inputStream.read(buffer, 0, wanted);
            if (got < 0) {
                // End of file before any boundary: report "nothing to read in this split".
                return blockLength + 1;
            }
            for (int i = 0; i < got; i++) {
                final byte b = buffer[i];
                if (b == '}') {
                    sawDocumentEnd = true;
                } else if (b == '{') {
                    if (sawDocumentEnd) {
                        return toSkip;
                    }
                } else if (b == ',') {
                    // A comma after '}' means the '}' closed a nested value, not a document.
                    sawDocumentEnd = false;
                }
                toSkip++;
            }
        }

        return toSkip;
    }

    /**
     * Opens the file behind the split, positions the stream on a document
     * boundary and creates the document stream that {@link #nextKeyValue()}
     * iterates over.
     *
     * @param arg0        the split to read; must be a {@link FileSplit}
     * @param taskContext supplies the job {@link Configuration}
     * @throws IOException if the file cannot be opened, sought or read
     */
    @Override
    public void initialize(InputSplit arg0, TaskAttemptContext taskContext)
            throws IOException, InterruptedException {

        documentStream = null;
        it = null;
        documentCount = 0;
        key = new LongWritable();
        document = null;
        currentPos = 0;

        /* get the split */
        FileSplit split = (FileSplit) arg0;

        /* get configuration object */
        Configuration job = taskContext.getConfiguration();

        /* initialize file /input stream */
        final Path path = split.getPath();
        FileSystem fs = path.getFileSystem(job);
        inputStream = fs.open(path);

        /*
         * if this block is not the first block check if it falls on document
         * boundary. If not, skip bytes to start to the next document boundary.
         */
        start = split.getStart();
        blockLength = split.getLength();
        end = start + blockLength;

        if (start != 0) {
            /*
             * not the first block check if it starts on a document boundary
             */
            final long skipBytes = bytesToSkip(start, blockLength);
            /* the scan began one byte before the split, hence the -1 */
            currentPos = start - 1 + skipBytes;
            inputStream.seek(currentPos);
        }

        /* Initialize a stream reader so that it can read multiple documents from */
        /* the file */
        documentStream = Json.newDocumentStream(inputStream);
        it = documentStream.iterator();
    }

    /**
     * Advances to the next document in the split.
     *
     * <p>NOTE(review): the end-of-split check uses the raw stream position. If
     * the document stream reads ahead of the document it has returned, that
     * position may overshoot -- confirm against the
     * {@code Json.newDocumentStream} implementation.
     *
     * @return {@code true} if a document was read and is available through
     *         {@link #getCurrentKey()} / {@link #getCurrentValue()};
     *         {@code false} when the stream position has reached the end of the
     *         split or no more documents are available
     * @throws IOException if the stream position cannot be obtained
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        currentPos = inputStream.getPos();
        if (currentPos >= (start + blockLength)) {
            return false;
        }

        if (!it.hasNext()) {
            return false;
        }

        key.set(documentCount);
        document = it.next();
        documentCount++;
        // Refresh the position so getProgress() reflects the bytes just consumed.
        currentPos = inputStream.getPos();
        return true;
    }

}