de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java Source code

Introduction

Here is the source code for de.l3s.streamcorpus.terrier.ThriftFileCollectionRecordReader.java

Source

/*******************************************************************************
 * Copyright 2012 Edgar Meij
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.l3s.streamcorpus.terrier;

import ilps.hadoop.StreamItemWritable;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.Seekable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CodecPool;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.CompressionCodecFactory;
import org.apache.hadoop.io.compress.Decompressor;
import org.apache.hadoop.io.compress.SplitCompressionInputStream;
import org.apache.hadoop.io.compress.SplittableCompressionCodec;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CompressedSplitLineReader;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.SplitLineReader;
import org.apache.commons.compress.compressors.xz.XZCompressorInputStream;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.apache.thrift.protocol.TProtocol;
import org.apache.thrift.transport.TIOStreamTransport;
import org.apache.thrift.transport.TTransport;
import org.apache.thrift.transport.TTransportException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.terrier.utility.io.CountingInputStream;

/**
 * RecordReader that gets a file split containing the URLs of streamcorpus files
 * (rather than the files themselves) and emits each filename together with its
 * stream items.
 * 
 * @author tuan
 */
public class ThriftFileCollectionRecordReader extends RecordReader<Text, StreamItemWritable> {

    private static final Logger LOG = LoggerFactory.getLogger(ThriftFileCollectionRecordReader.class);

    // References to current input stream
    private FSDataInputStream fis;
    private BufferedInputStream bis;
    private XZCompressorInputStream xzis;
    private CountingInputStream cis;
    private TTransport transport;
    private TProtocol tp;

    private FileSystem fs;

    /** index of the file currently being read by this record reader (-1 before the first file is opened) */
    protected int collectionIndex;

    private long start;
    private long length;
    private long position;
    private Text key = new Text();
    private StreamItemWritable value = new StreamItemWritable();
    private TBinaryProtocol.Factory factory;
    private Configuration conf;

    /** list of all paths */
    private List<String> paths;

    @Override
    public void close() throws IOException {
        if (transport != null)
            transport.close();
        if (cis != null) {
            cis.close();
            cis = null;
        }
        if (xzis != null)
            xzis.close();
        if (bis != null)
            bis.close();
        if (fis != null)
            fis.close();
    }

    /** Returns our progress over the whole collection of files, as a float between 0 and 1. */
    @Override
    public float getProgress() {

        if (paths == null || paths.isEmpty()) {
            return 0.0f;
        }

        if (length == 0) {
            return ((float) collectionIndex) / paths.size();
        }

        float fileProgress = 0;
        if (fis != null && length != start)
            fileProgress = (float) position / (float) (length - start);
        return Math.min(1.0f, (fileProgress + (float) collectionIndex) / paths.size());
    }

    /** 
     * Reads the URLs / paths of the streamcorpus files from the input file.
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
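        // Start before the first file; the first call to getNextFile() advances to index 0.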

        collectionIndex = -1;

        conf = context.getConfiguration();
        fs = FileSystem.get(conf);

        loadPathsFromInputSplit(split, conf);
    }

    /** 
     * Reads the lines of file paths into a list.
     * The code in this method is adapted from Hadoop's LineRecordReader.
     * 
     * @throws IOException 
     */
    private void loadPathsFromInputSplit(InputSplit split, Configuration conf) throws IOException {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();

        long begin = fileSplit.getStart();
        long end = begin + fileSplit.getLength();

        LOG.info("Reading paths in file " + path.getName());

        // First check the compression codec
        CompressionCodecFactory compressionCodec = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodec.getCodec(path);
        FSDataInputStream fis = fs.open(path);
        SplitLineReader in;

        Seekable filePosition;

        boolean compressed = false;
        Decompressor decompressor = null;
        if (null != codec) {
            compressed = true;
            decompressor = CodecPool.getDecompressor(codec);
            if (codec instanceof SplittableCompressionCodec) {
                final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(fis,
                        decompressor, begin, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
                in = new CompressedSplitLineReader(cIn, conf, (byte[]) null);
                begin = cIn.getAdjustedStart();
                end = cIn.getAdjustedEnd();
                filePosition = cIn;
            } else {
                in = new SplitLineReader(codec.createInputStream(fis, decompressor), conf, null);
                filePosition = fis;
            }
        } else {
            fis.seek(begin);
            in = new SplitLineReader(fis, conf, (byte[]) null);
            filePosition = fis;
        }
        // If this is not the first split, we always throw away first record
        // because we always (except the last split) read one extra line in
        // next() method.
        if (begin != 0) {
            begin += in.readLine(new Text(), 0, maxBytesToConsume(compressed, begin, end));
        }
        long pos = begin;

        int newSize = 0;
        final Text nextLine = new Text();
        paths = new ArrayList<>();
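        // Each line of the split is the URL/path of one streamcorpus file.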
        while (getFilePosition(compressed, filePosition, pos) <= end || in.needAdditionalRecordAfterSplit()) {

            if (pos == 0) {
                // Strip BOM(Byte Order Mark)
                // Text only support UTF-8, we only need to check UTF-8 BOM
                // (0xEF,0xBB,0xBF) at the start of the text stream.
                newSize = in.readLine(nextLine, Integer.MAX_VALUE, Integer.MAX_VALUE);
                pos += newSize;
                int textLength = nextLine.getLength();
                byte[] textBytes = nextLine.getBytes();
                if ((textLength >= 3) && (textBytes[0] == (byte) 0xEF) && (textBytes[1] == (byte) 0xBB)
                        && (textBytes[2] == (byte) 0xBF)) {
                    // find UTF-8 BOM, strip it.
                    LOG.info("Found UTF-8 BOM and skipped it");
                    textLength -= 3;
                    newSize -= 3;
                    if (textLength > 0) {
                        // It may work to use the same buffer and 
                        // not do the copyBytes
                        textBytes = nextLine.copyBytes();
                        nextLine.set(textBytes, 3, textLength);
                    } else {
                        nextLine.clear();
                    }
                }
            } else {
                newSize = in.readLine(nextLine, Integer.MAX_VALUE, maxBytesToConsume(compressed, pos, end));
                if (newSize == 0) {
                    // End of stream: nothing was consumed, stop reading
                    break;
                }
                pos += newSize;
            }

            paths.add(nextLine.toString());
            LOG.debug("Read path of size " + newSize + " at pos " + (pos - newSize));
        }

        try {
            if (in != null) {
                in.close();
            }
            if (fis != null) {
                fis.close();
            }
        } finally {
            if (decompressor != null) {
                CodecPool.returnDecompressor(decompressor);
            }
        }
    }

    private static int maxBytesToConsume(boolean isCompressedInput, long pos, long end) {
        // Adapted from Hadoop's LineRecordReader, which additionally bounds the
        // uncompressed case by a configured maximum line length. The original
        // expression, Math.max(Math.min(Integer.MAX_VALUE, end - pos), Integer.MAX_VALUE),
        // always evaluates to Integer.MAX_VALUE, so the bound is stated directly;
        // this also lets the extra line read past the split end be consumed whole.
        return Integer.MAX_VALUE;
    }

    private static long getFilePosition(boolean isCompressedInput, Seekable filePosition, long pos)
            throws IOException {
        long retVal;
        if (isCompressedInput && null != filePosition) {
            retVal = filePosition.getPos();
        } else {
            retVal = pos;
        }
        return retVal;
    }

    /** Opens the next file in the collection.
     * @throws IOException */
    private boolean getNextFile() throws IOException {

        if (paths == null || paths.isEmpty()) {
            return false;
        }

        // close the current file
        close();

        // no more paths to process
        if (++collectionIndex >= paths.size()) {
            return false;
        }

        Path path = new Path(paths.get(collectionIndex));
        // Some files are corrupted, report them and move on
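        // Build the stream chain: FSDataInputStream -> BufferedInputStream
        // [-> XZCompressorInputStream for .xz files] -> CountingInputStream,
        // which the Thrift TIOStreamTransport then wraps.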
        try {
            fis = fs.open(path);
            bis = new BufferedInputStream(fis);

            if (paths.get(collectionIndex).endsWith(".xz")) {
                xzis = new XZCompressorInputStream(bis);
                cis = new CountingInputStream(xzis);
            } else {
                xzis = null;
                cis = new CountingInputStream(bis);
            }
            transport = new TIOStreamTransport(cis);
            position = start = cis.getPos();
            length = fs.getFileStatus(path).getLen();

        } catch (IOException e) {
            LOG.error("Bad file: " + path, e);
        }

        try {
            if (transport != null) {
                transport.open();
            } else {
                // The file could not be opened above; skip it and move on
                return getNextFile();
            }
        } catch (TTransportException e) {
            throw new IOException(e);
        }

        factory = new TBinaryProtocol.Factory();
        tp = factory.getProtocol(transport);
        value = new StreamItemWritable(factory);

        return true;
    }

    /**
     * Parses the next key-value pair and updates the position; returns false
     * when all files in the collection are exhausted.
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {

        while (true) {

            // The file is corrupted, skip it
            if (cis == null) {
                if (!getNextFile()) {
                    return false;
                } else
                    continue;
            }

            key.set(paths.get(collectionIndex));

            // assume the underlying file is opened, read and when the EOF
            // is met, move to the next file
            if (cis.available() > 0) {
                try {
                    value.read(tp);
                    position = cis.getPos();
                    return true;
                } catch (TTransportException e) {
                    int type = e.getType();
                    if (type == TTransportException.END_OF_FILE) {
                        if (!getNextFile()) {
                            return false;
                        } else
                            continue;
                    }
                } catch (TException e) {
                    e.printStackTrace();
                    throw new IOException(e);
                }
            } else {
                if (!getNextFile()) {
                    return false;
                } else
                    continue;
            }
        }
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    @Override
    public StreamItemWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }
}
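
For context, here is a minimal sketch (not part of the original source) of how this reader could be wired into a Hadoop job. The class name ThriftFileCollectionInputFormat is hypothetical; the framework initializes the returned reader via initialize(split, context) before the first call to nextKeyValue().

package de.l3s.streamcorpus.terrier;

import ilps.hadoop.StreamItemWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

/** Hypothetical InputFormat that hands out the collection record reader. */
public class ThriftFileCollectionInputFormat
        extends FileInputFormat<Text, StreamItemWritable> {

    @Override
    public RecordReader<Text, StreamItemWritable> createRecordReader(
            InputSplit split, TaskAttemptContext context) {
        // One reader per split of the listing file; the framework calls
        // initialize(split, context) on it before reading records.
        return new ThriftFileCollectionRecordReader();
    }
}

A job would then point the input format at the plain-text listing file (one streamcorpus URL/path per line), not at the .xz corpus files themselves, e.g. job.setInputFormatClass(ThriftFileCollectionInputFormat.class) followed by FileInputFormat.addInputPath(job, new Path("/path/to/listing.txt")).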