org.commoncrawl.util.TimeSeriesDataFile.java Source code

Java tutorial

Introduction

Here is the source code for org.commoncrawl.util.TimeSeriesDataFile.java

Source

/**
 * Copyright 2008 - CommonCrawl Foundation
 * 
 *    This program is free software: you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation, either version 3 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 **/

package org.commoncrawl.util;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.zip.CRC32;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.record.Buffer;

/**
 * Quick and dirty (for now) way to append {@link Writable} records, each keyed by a long,
 * to a local disk file and to read them back again.
 * 
 * @author rana
 *
 */
public class TimeSeriesDataFile<ValueType extends Writable> {

    private static final int SyncBytes = 0xCC00CC00;
    private CRC32 crc = new CRC32();
    private File fileName = null;
    private static final int RECORD_HEADER_LENGTH = 4 * 4;
    private Class valueClass;

    /**
     * Simple holder pairing a record key with its decoded value.
     *
     * @param <KeyType> type of the key
     * @param <ValueType> type of the value
     */
    public static class KeyValueTuple<KeyType, ValueType> {

        /** key of the record */
        public KeyType key;
        /** value of the record */
        public ValueType value;
        /** file position associated with the record; not set by the constructor, assigned by whoever builds the tuple */
        public long recordPos;

        /**
         * @param key the record key
         * @param value the record value
         */
        public KeyValueTuple(KeyType key, ValueType value) {
            this.value = value;
            this.key = key;
        }
    }

    /**
     * Binds this instance to a data file. No I/O happens here; the file is
     * opened (and created if missing) when a record is appended.
     * 
     * @param fileName the data file path
     * @param valueClass class that is instantiated for every value read back from the file
     */
    public TimeSeriesDataFile(File fileName, Class valueClass) {
        this.valueClass = valueClass;
        this.fileName = fileName;
    }

    /**
     * Appends one record to the file and rewrites the file header to reflect it.
     * 
     * <p>On-disk record layout (all integers big-endian):
     * <pre>
     *   int   sync marker (SyncBytes)
     *   int   record length  = total record bytes - 8
     *   long  CRC32 of everything that follows the first 16 bytes
     *   vlong key
     *   ...   value bytes (as written by value.write)
     *   int   trailing length = total record bytes - 4 (used to walk the file backwards)
     * </pre>
     * 
     * @param key the record key
     * @param value the record value
     * @return the file position at which the record was written
     * @throws IOException if the file cannot be opened, read or written
     */
    public synchronized long appendRecordToLogFile(long key, Writable value) throws IOException {

        LogFileHeader header = new LogFileHeader();

        RandomAccessFile file = new RandomAccessFile(fileName, "rw");

        long recordPositionOut = -1;

        try {

            // a zero length file (freshly created by the open above, or pre-created
            // empty by the caller) has no header to read yet 
            if (file.length() > 0) {

                long headerOffset = readLogFileHeader(file, header);

                // a write position of zero means no record has been appended so far,
                // in which case the first record goes right after the header
                if (header._writePos == 0) {
                    recordPositionOut = headerOffset;
                } else {
                    recordPositionOut = header._writePos;
                }
                // seek to appropriate write position 
                file.seek(recordPositionOut);

            } else {
                recordPositionOut = writeLogFileHeader(file, header);
            }

            DataOutputBuffer buffer = new DataOutputBuffer();

            // write out sync bytes ... 
            buffer.writeInt(SyncBytes);
            // write out placeholder for record length 
            buffer.writeInt(0);
            // write out placeholder for crc 
            buffer.writeLong(0);
            // write out key to buffer
            WritableUtils.writeVLong(buffer, key);
            // write out value ... 
            value.write(buffer);
            // write out trailing record size 
            // (4 bytes sync + 4 bytes record length + 8 bytes crc + key/value bytes)
            buffer.writeInt(buffer.getLength());
            // reset crc 
            crc.reset();
            // calc crc over everything after the fixed size record header 
            crc.update(buffer.getData(), RECORD_HEADER_LENGTH, buffer.getLength() - RECORD_HEADER_LENGTH);
            // ok fix up record ... 
            // write out record length
            // total length - sync bytes(4) - record length(4), at offset 4
            writeInt(buffer.getLength() - 8, 4, buffer.getData());
            // and write out crc
            // at offset 8 (after sync(4) and length(4))
            writeLong(crc.getValue(), 8, buffer.getData());

            // and then the data 
            file.write(buffer.getData(), 0, buffer.getLength());

            // now update header ... 
            header._itemCount += 1;
            header._writePos = file.getFilePointer();
            // same value as the trailing length stored at the end of the record 
            header._lastRecordLength = buffer.getLength() - 4;
            header._lastRecordKey = key;
            // now write out header anew ... 
            writeLogFileHeader(file, header);

        } finally {
            file.close();
        }

        return recordPositionOut;
    }

    /**
     * Reads the records that precede a given file position, walking backwards from it.
     * 
     * <p>Nothing is read when the file does not exist, or when the position does
     * not lie beyond the file header.
     * 
     * @param position file position to start the read at (the end of the first record to be read)
     * @param maxNumberOfRecords maximum number of records to read; 0 reads nothing
     * @param optionalMinKeyValue optional min key value to limit the read by, or -1 for no limit
     * @return a list of KeyValueTuples, possibly empty
     * @throws IOException if the file cannot be read or a record fails validation
     */
    public synchronized ArrayList<KeyValueTuple<Long, ValueType>> readFromPos(long position, int maxNumberOfRecords,
            long optionalMinKeyValue) throws IOException {

        ArrayList<KeyValueTuple<Long, ValueType>> results = new ArrayList<KeyValueTuple<Long, ValueType>>();

        // no file, no records 
        if (!fileName.exists()) {
            return results;
        }

        LogFileHeader fileHeader = new LogFileHeader();
        RandomAccessFile input = new RandomAccessFile(fileName, "r");

        try {
            // the header read leaves us with the offset of the first record 
            long firstRecordOffset = readLogFileHeader(input, fileHeader);

            if (position > firstRecordOffset) {
                // the four bytes in front of the position hold the length 
                // of the record that ends there 
                input.seek(position - 4);
                int precedingRecordLength = input.readInt();

                // hand off to the shared backwards reader 
                doCommonRead(results, input, firstRecordOffset, position, precedingRecordLength,
                        maxNumberOfRecords, optionalMinKeyValue);
            }
        } finally {
            input.close();
        }

        return results;
    }

    /**
     * Reads records from the tail end of the file, walking backwards from the
     * write position stored in the file header.
     * 
     * <p>An empty list is returned when the file does not exist, when
     * maxNumberOfRecords is 0, or when the header shows that no record has
     * been appended yet.
     * 
     * @param maxNumberOfRecords the maximum number of records to read from the tail 
     * @param optionalMinKeyValue optional min key value to limit the read by, or -1 for no limit
     * @return a list of records at the tail end of the file  
     * @throws IOException if the file cannot be read or a record fails validation
     */
    public synchronized ArrayList<KeyValueTuple<Long, ValueType>> readFromTail(int maxNumberOfRecords,
            long optionalMinKeyValue) throws IOException {

        ArrayList<KeyValueTuple<Long, ValueType>> valuesOut = new ArrayList<KeyValueTuple<Long, ValueType>>();

        LogFileHeader header = new LogFileHeader();

        if (fileName.exists()) {

            RandomAccessFile file = new RandomAccessFile(fileName, "r");

            try {

                //read header ... 
                long headerOffset = readLogFileHeader(file, header);

                // the header's write position only moves past the header once a record 
                // has been appended; without this check a header-only file would make 
                // the backwards walk seek to a negative offset 
                if (maxNumberOfRecords != 0 && header._writePos > headerOffset) {

                    // the last record ends at the write position ... 
                    long endOfPrevRecord = header._writePos;

                    // ... and its length is tracked in the header 
                    int currentRecordLength = header._lastRecordLength;

                    // delegate to common read
                    doCommonRead(valuesOut, file, headerOffset, endOfPrevRecord, currentRecordLength,
                            maxNumberOfRecords, optionalMinKeyValue);
                }
            } finally {
                file.close();
            }
        }

        return valuesOut;
    }

    /**
     * Shared backwards reader: starting at the end of a record, decodes records
     * one at a time towards the start of the file and inserts each at the front
     * of valuesOut, so the list ends up in file order.
     * 
     * <p>The walk stops when recordsToRead records have been read, when the first
     * record of the file (the one at headerOffset) has been read, or when a key
     * smaller than optionalMinKeyValue is encountered.
     * 
     * @param valuesOut list that receives the decoded records
     * @param file open file to read from
     * @param headerOffset file position of the first record (just past the file header)
     * @param endOfPrevRecord file position at which the first record to read ends
     * @param currentRecordLength trailing length value of the first record to read
     * @param recordsToRead maximum number of records to read; 0 reads nothing
     * @param optionalMinKeyValue min key value to limit the read by, or -1 for no limit
     * @throws IOException on read failure, truncated data, bad sync marker, CRC mismatch
     *         or failure to instantiate / deserialize a value
     */
    private void doCommonRead(ArrayList<KeyValueTuple<Long, ValueType>> valuesOut, RandomAccessFile file,
            long headerOffset, long endOfPrevRecord, int currentRecordLength, int recordsToRead,
            long optionalMinKeyValue) throws IOException {

        Buffer recordBuffer = new Buffer();
        DataInputBuffer inputBuffer = new DataInputBuffer();

        // ok start walking backwards ... 
        while (recordsToRead != 0) {
            // move to the start of the current record 
            // (the stored length excludes the 4 byte trailing length field itself)
            endOfPrevRecord = endOfPrevRecord - currentRecordLength - 4;
            // and seek 4 bytes ahead of it, so that the trailing length 
            // of the preceding record is picked up in the same read 
            file.seek(endOfPrevRecord - 4);

            // preceding record's trailing length (4) + the whole current record 
            int bytesToRead = currentRecordLength + 8;

            recordBuffer.setCapacity(bytesToRead);
            // read in proper amount of data ... readFully, because a plain read 
            // is allowed to return fewer bytes than requested 
            file.readFully(recordBuffer.get(), 0, bytesToRead);
            // ok initialize input buffer ... 
            inputBuffer.reset(recordBuffer.get(), bytesToRead);
            // now read next record length first ... 
            int nextRecordLength = inputBuffer.readInt();
            // next read sync bytes ... 
            int syncBytes = inputBuffer.readInt();
            // validate 
            if (syncBytes != SyncBytes) {
                throw new IOException("Corrupt Record Detected!");
            }
            // ok read real record bytes ... 
            int realRecordBytes = inputBuffer.readInt();
            // read crc ... 
            long crcValue = inputBuffer.readLong();
            // the stored length includes the 8 crc bytes, which are not checksummed 
            int checksummedBytes = realRecordBytes - 8;
            // a damaged length field must not send the crc past the data we actually read 
            if (checksummedBytes < 0 || checksummedBytes > bytesToRead - inputBuffer.getPosition()) {
                throw new IOException("Corrupt Record Detected!");
            }
            // ok validate crc ...  
            crc.reset();
            crc.update(inputBuffer.getData(), inputBuffer.getPosition(), checksummedBytes);
            if (crcValue != crc.getValue()) {
                throw new IOException("CRC Mismatch!");
            }
            // ok now read key and value 
            try {
                long key = WritableUtils.readVLong(inputBuffer);

                // stop as soon as we walk past the requested minimum key 
                if (optionalMinKeyValue != -1 && key < optionalMinKeyValue) {
                    break;
                }

                @SuppressWarnings("unchecked")
                ValueType value = (ValueType) valueClass.newInstance();
                value.readFields(inputBuffer);
                KeyValueTuple<Long, ValueType> tuple = new KeyValueTuple<Long, ValueType>(key, value);
                tuple.recordPos = endOfPrevRecord;
                // we are walking backwards, so insert at the front to keep file order 
                valuesOut.add(0, tuple);

            } catch (Exception e) {
                throw new IOException(e);
            }

            currentRecordLength = nextRecordLength;

            recordsToRead--;

            // the record just read was the first one in the file 
            if (endOfPrevRecord == headerOffset)
                break;
        }
    }

    /**
     * Gets the key of the last record in the file, as tracked by the file header.
     * 
     * @return the last record key, or -1 if the file does not exist or its header 
     *         still carries the initial key value of -1 (no record appended)
     * @throws IOException if the header cannot be read
     */
    public synchronized long getLastRecordKey() throws IOException {

        if (!fileName.exists()) {
            return -1;
        }

        LogFileHeader header = new LogFileHeader();

        RandomAccessFile file = new RandomAccessFile(fileName, "r");

        try {
            // only the header is needed ... 
            readLogFileHeader(file, header);

            return header._lastRecordKey;
        } finally {
            file.close();
        }
    }

    /**
     * Gets the number of records in the file, as tracked by the file header.
     * 
     * @return record count stored in the header, or 0 if the file does not exist
     * @throws IOException if the header cannot be read
     */
    public synchronized int getRecordCount() throws IOException {

        if (!fileName.exists()) {
            return 0;
        }

        LogFileHeader header = new LogFileHeader();

        RandomAccessFile file = new RandomAccessFile(fileName, "r");

        try {
            // only the header is needed ... 
            readLogFileHeader(file, header);

            return header._itemCount;
        } finally {
            file.close();
        }
    }

    /**
     * Fixed size header stored at the start of the file.
     * 
     * <p>Serialized layout, in order: int magic, int version, long write position,
     * int item count, int last record length, long last record key.
     */
    private static class LogFileHeader {

        /** magic number identifying the file (same value as the record sync marker) */
        public static final int LogFileHeaderBytes = SyncBytes;
        /** the only file format version written and accepted */
        public static final int LogFileVersion = 1;

        /** creates a header describing a file with no records */
        public LogFileHeader() {
            _writePos = 0;
            _itemCount = 0;
            _lastRecordLength = 0;
            _lastRecordKey = -1;
        }

        /** file position at which the next record is to be written (0 until a record is appended) */
        public long _writePos;
        /** number of records appended */
        public int _itemCount;
        /** length of the most recently appended record, excluding its 4 byte trailing length field */
        public int _lastRecordLength;
        /** key of the most recently appended record (-1 initially) */
        public long _lastRecordKey;

        /**
         * Serializes the header to the given stream.
         * 
         * @param stream destination to write to
         * @throws IOException if the write fails
         */
        public void writeHeader(DataOutput stream) throws IOException {
            stream.writeInt(LogFileHeaderBytes);
            stream.writeInt(LogFileVersion);
            stream.writeLong(_writePos);
            stream.writeInt(_itemCount);
            stream.writeInt(_lastRecordLength);
            stream.writeLong(_lastRecordKey);
        }

        /**
         * Populates the header from the given stream.
         * 
         * @param stream source to read from
         * @throws IOException if the read fails, or if either the magic number 
         *         or the version does not match
         */
        public void readHeader(DataInput stream) throws IOException {
            int headerBytes = stream.readInt();
            int version = stream.readInt();
            // reject the file if EITHER field is wrong; requiring both to be 
            // wrong would let a bad magic or a bad version slip through 
            if (headerBytes != LogFileHeaderBytes || version != LogFileVersion) {
                throw new IOException("Invalid CrawlLog File Header Detected!");
            }
            _writePos = stream.readLong();
            _itemCount = stream.readInt();
            _lastRecordLength = stream.readInt();
            _lastRecordKey = stream.readLong();
        }
    }

    /**
     * Writes the header at the very start of the file.
     * 
     * @param file open, writable file
     * @param header header to serialize
     * @return the file position right after the header
     * @throws IOException if seeking or writing fails
     */
    private static long writeLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {

        // the header always lives at offset zero 
        file.seek(0L);
        header.writeHeader(file);

        // NOTE: the write used to be followed by file.getFD().sync(), which was 
        // taken out because it was becoming a severe bottleneck 

        long positionAfterHeader = file.getFilePointer();
        return positionAfterHeader;
    }

    /**
     * Reads the header from the very start of the file.
     * 
     * @param file open file to read from
     * @param header header object to populate
     * @return the file position right after the header
     * @throws IOException if seeking or reading fails, or the header is rejected
     */
    private static long readLogFileHeader(RandomAccessFile file, LogFileHeader header) throws IOException {

        // the header always lives at offset zero 
        file.seek(0L);
        header.readHeader(file);

        long positionAfterHeader = file.getFilePointer();
        return positionAfterHeader;
    }

    /**
     * Stores an int into a byte array as four bytes, most significant byte first.
     * 
     * @param value the value to store
     * @param atOffset index of the first byte to write
     * @param intoBytes destination array
     * @throws IOException declared by the signature; nothing in the body throws it
     */
    static void writeInt(int value, int atOffset, byte[] intoBytes) throws IOException {
        for (int i = 0; i < 4; i++) {
            // 24, 16, 8, 0 
            int shift = 8 * (3 - i);
            intoBytes[atOffset + i] = (byte) (value >>> shift);
        }
    }

    /**
     * Stores a long into a byte array as eight bytes, most significant byte first.
     * 
     * @param value the value to store
     * @param atOffset index of the first byte to write
     * @param intoBytes destination array
     * @throws IOException declared by the signature; nothing in the body throws it
     */
    static void writeLong(long value, int atOffset, byte[] intoBytes) throws IOException {
        for (int i = 0; i < 8; i++) {
            // 56, 48, ... 8, 0 
            int shift = 8 * (7 - i);
            intoBytes[atOffset + i] = (byte) (value >>> shift);
        }
    }
}