org.noroomattheinn.timeseries.PersistentTS.java Source code

Java tutorial

Introduction

Here is the source code for org.noroomattheinn.timeseries.PersistentTS.java

Source

/*
 * PersistentTS.java - Copyright(c) 2014 Joe Pasqua
 * Provided under the MIT License. See the LICENSE file for details.
 * Created: Nov 25, 2014
 */
package org.noroomattheinn.timeseries;

import com.google.common.collect.Range;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.PrintStream;
import java.util.Date;
import java.util.Timer;
import java.util.TimerTask;
import static org.noroomattheinn.timeseries.TSBase.logger;

/**
 * PersistentTS: A persistent repository for time series data.
 *
 * A PersistentTS is represented by a header file and a data file.
 * 
 * The header file contains two lines:
 * VERSION:
 *      A number that corresponds to the implementation that wrote the repository
 * STRING[\tSTRING]*
 *      A tab-separated list of strings. Each String represents the name of 
 *      a column that is stored in the data file
 * 
 * The data file contains lines that are either comments or data rows:
 * COMMENT: Any line beginning with a # is an uninterpreted comment
 * DATA ROW: All data rows have the form:
 *      TIMESTAMP BITVECTOR VAL[\tVAL]*
 * where
 *      TIMESTAMP is a long indicating the time of the sample. This value
 *      is delta-encoded meaning you must accumulate values up to a row in
 *      order to know the timestamp of that row. If the stored value is negative
 *      then it represents an absolute (not delta-encoded) value given by abs();
 * 
 *      BITVECTOR is the hex representation of a 64-bit bit vector
 *      which indicates which samples were recorded at this timestamp
 * 
 *      VAL+ is a tab separated list of values. There must be as
 *      many values in this list as 1 bits in the bit vector.
 *      A value may be:<ul>
 *      <li>A double value represented as a String</li>
 *      <li>The literal "*" which indicates that this value 
 *      is the same as the last recorded value of this column.</li>
 *      <li>The literal "!" which indicates that this value 
 *      should be ignored and removed from the bit vector. This
 *      can be used to take the place of NaN or INF values.</li>
 *      </ul>
 * 
 * @author Joe Pasqua <joe at NoRoomAtTheInn dot org>
 */
public class PersistentTS extends TSBase {
    /*------------------------------------------------------------------------------
     *
     * Constants and Enums
     * 
     *----------------------------------------------------------------------------*/
    // Header-file format version written by this implementation; readers
    // refuse to open repositories written by a newer version.
    private static final int RepoVersion = 1;
    // Milliseconds between automatic background flushes (20 seconds)
    private static final long FlushInterval = 20 * 1000L;

    /*------------------------------------------------------------------------------
     *
     * Internal State
     * 
     *----------------------------------------------------------------------------*/

    private final Repo repo; // The underlying file-based repository
    private final Emitter emitter; // Used to encode and write rows
    private final Timer timer; // Schedules the background flush() task
    private Row pendingRow; // Buffered row; same-bucket writes merge into it
    private long timeOfFirstRow; // Oldest timestamp (Long.MAX_VALUE if no rows)
    /*==============================================================================
     * -------                                                               -------
     * -------              Public Interface To This Class                   ------- 
     * -------                                                               -------
     *============================================================================*/

    /**
     * Create PersistentTimeSeries object that is ready to take writes
     * 
     * @param container     The directory that should contain the persistent store
     * @param baseName      The baseName of the persistent store files
     * @param schema        Describes the schema of the rows in the store
     * @param forceOrdering If true, then all data added to the time series
     *                      will be forced to have monotonically increasing
     *                      timestamps. If a row or value is added whose time-
     *                      stamp is less than a value that has already been
     *                      added, the newer timestamp will be used.
     *                      If false, an old timestamp will result in an
     *                      IllegalArgumentException
     */
    public PersistentTS(File container, String baseName, RowDescriptor schema, boolean forceOrdering)
            throws IOException {
        super(schema);

        this.repo = Repo.getRepo(container, baseName, schema);
        this.emitter = new Emitter(forceOrdering);
        this.pendingRow = null;
        this.timer = new Timer();

        // Flush buffered data periodically. FIX: the original used the
        // one-shot schedule(task, delay) overload, so the background flush
        // ran exactly once, FlushInterval ms after construction; rows
        // buffered later were only persisted by an explicit flush()/close().
        timer.schedule(new TimerTask() {
            @Override
            public void run() {
                flush();
            }
        }, FlushInterval, FlushInterval);

        // Determine the timestamp of the oldest row (if any) by streaming
        // just the first data row; collect() returns false to stop early.
        timeOfFirstRow = Long.MAX_VALUE; // Sentinel: stays if repo is empty
        streamRows(Range.<Long>all(), new RowCollector() {
            @Override
            public boolean collect(Row r) {
                timeOfFirstRow = r.timestamp;
                return false;
            }
        });
    }

    /**
     * Check whether a persistent store already exists for the given base name.
     *
     * @param container The directory that would contain the store's files
     * @param baseName  The base name of the store's files
     * @return true only if both the header and the data file are present
     */
    public static boolean repoExistsFor(File container, String baseName) {
        return Repo.repoExistsFor(container, baseName);
    }

    /*------------------------------------------------------------------------------
     *
     * Methods overriden from TimeSeries
     * 
     *----------------------------------------------------------------------------*/

    @Override
    public long firstTime() {
        // Long.MAX_VALUE when the repository contained no rows at construction
        return timeOfFirstRow;
    }

    /**
     * Buffer a row for storage. Rows whose deflated timestamps (timestamp/100)
     * match the currently buffered row are merged into it rather than written;
     * otherwise the buffered row is emitted and r becomes the new buffer.
     */
    @Override
    public synchronized Row storeRow(Row r) throws IllegalArgumentException {
        if (pendingRow != null) {
            if (deflate(pendingRow.timestamp) == deflate(r.timestamp)) {
                // Same time bucket: fold the new values into the pending row
                pendingRow.mergeWith(r);
                logger.info("Merging");
                return r;
            }
            // Different bucket: persist what we had, then buffer r
            emitter.emit(pendingRow);
        }
        pendingRow = r;
        return r;
    }

    /**
     * Stream rows whose timestamps fall within period to the collector.
     *
     * Decodes the on-disk format: a delta-encoded timestamp (negative stored
     * values are absolute), a hex bit vector of recorded columns, then one
     * value token per 1 bit ("*" repeats the column's previous value, "!"
     * marks an ignored NaN/INF placeholder). Columns without a recorded value
     * carry the last value seen for that column.
     *
     * Fixes over the original:
     * - try-with-resources guarantees the reader is closed even if the
     *   collector throws (previously the reader leaked on that path);
     * - truncated/corrupt lines (missing bit vector, or fewer value tokens
     *   than 1 bits) are skipped or degraded gracefully instead of throwing
     *   ArrayIndexOutOfBoundsException out of the method.
     *
     * @param period    the time range of interest; null means "all time"
     * @param collector receives each decoded row; return false to stop
     */
    @Override
    public final synchronized void streamRows(Range<Long> period, RowCollector collector) {
        double[] accumulator = new double[schema.nColumns];
        if (period == null)
            period = Range.all();
        long fromTime = period.hasLowerBound() ? period.lowerEndpoint() : 0L;
        long toTime = period.hasUpperBound() ? period.upperEndpoint() : Long.MAX_VALUE;
        long prevTime = 0;
        try (BufferedReader rdr = repo.getReader()) {
            String line;
            while ((line = rdr.readLine()) != null) {
                if (line.startsWith("#")) {
                    continue; // Uninterpreted comment line
                }
                String[] tokens = line.split("\t");

                // The first entry on the line is the time in delta format
                Long time = longValue(tokens[0]);
                if (time == null) {
                    continue; // Invalid format, ignore this line
                }
                time = time < 0 ? -time : time + prevTime;
                prevTime = time; // Keep a running tally of the current time

                time = inflate(time);
                if (time < fromTime)
                    continue; // Out of range, ignore & move on
                if (time > toTime)
                    break; // Out of range; rows are time-ordered, so stop

                Row row = new Row(time, 0L, schema.nColumns);

                // The second element is a bitvector saying which columns
                // have values on this line
                if (tokens.length < 2) {
                    continue; // Truncated line, ignore it
                }
                Long bitVector = longValue("0x" + tokens[1]);
                if (bitVector == null) {
                    continue; // Invalid format, ignore this line
                }
                row.bitVector = bitVector;

                // The remaining entries are readings, one per 1 bit in the
                // bitvector; bit positions follow the schema's column order
                long bit = 1;
                int tokenIndex = 2;
                for (int i = 0; i < schema.nColumns; i++) {
                    row.values[i] = accumulator[i]; // Default to previous value
                    if (row.includes(bit)) {
                        if (tokenIndex >= tokens.length) {
                            row.clear(bit); // Fewer values than 1 bits: drop the claim
                        } else {
                            String valString = tokens[tokenIndex++];
                            switch (valString) {
                            case "*": // Same as previous value, already in place
                                break;
                            case "!": // Placeholder for NaN/INF: drop the bit
                                row.clear(bit);
                                break;
                            default:
                                Double val = doubleValue(valString);
                                if (val == null) {
                                    row.clear(bit);
                                } else {
                                    accumulator[i] = row.values[i] = val.doubleValue();
                                }
                                break;
                            }
                        }
                    }
                    bit = bit << 1;
                }
                if (!collector.collect(row))
                    break; // Collector asked to stop
            }
        } catch (IOException ex) {
            logger.severe("Error loading from repository" + ex);
        }
    }

    /**
     * Push all buffered data to persistent storage: first emit the pending
     * (possibly merged) row, if any, then flush the underlying stream.
     */
    @Override
    public synchronized void flush() {
        final Row buffered = pendingRow;
        if (buffered != null) {
            emitter.emit(buffered);
            pendingRow = null;
        }
        repo.flush();
    }

    @Override
    public synchronized void close() {
        flush(); // Persist the pending row and any buffered bytes
        repo.close(); // Closes the underlying PrintStream
        timer.cancel(); // Stop the scheduled flush task
    }

    /*------------------------------------------------------------------------------
     *
     * PRIVATE - Utility methods
     * 
     *----------------------------------------------------------------------------*/

    /**
     * Parse a long using Long.decode (accepts decimal, 0x-hex, and octal).
     *
     * @param valString the text to parse
     * @return the parsed value, or null (with a warning logged) on bad input
     */
    private static Long longValue(String valString) {
        Long parsed = null;
        try {
            parsed = Long.decode(valString);
        } catch (NumberFormatException e) {
            logger.warning("Invalid Long in TimeSeries: " + valString);
        }
        return parsed;
    }

    /**
     * Parse a double from its String representation.
     *
     * @param valString the text to parse
     * @return the parsed value, or null (with a warning logged) on bad input
     */
    private static Double doubleValue(String valString) {
        Double parsed = null;
        try {
            parsed = Double.valueOf(valString);
        } catch (NumberFormatException e) {
            logger.warning("Invalid Double in TimeSeries: " + valString);
        }
        return parsed;
    }

    /**
     * Reduce a timestamp's resolution for storage (integer divide by 100).
     * Rows whose deflated timestamps are equal get merged by storeRow().
     */
    private static long deflate(long timestamp) {
        return timestamp / 100;
    }

    /**
     * Restore a deflated timestamp to its original scale (multiply by 100).
     * Lossy round trip: inflate(deflate(t)) == t only when t is a multiple
     * of 100, so read-back timestamps have 100-unit granularity.
     */
    private static long inflate(long timestamp) {
        return timestamp * 100;
    }

    /**
     * Emitter writes rows in the compact on-disk encoding: a delta-encoded
     * timestamp (negative = absolute), a hex bit vector, then one value per
     * 1 bit — "*" for a repeated value, "!" for NaN/INF placeholders.
     */
    private class Emitter {
        private Row lastRowEmitted; // Most recently emitted row (null at start)
        private long lastEmittedTime; // Deflated timestamp actually written to disk
        private final PrintStream ps;
        private final boolean forceOrdering;

        Emitter(boolean forceOrdering) {
            this.lastRowEmitted = null;
            this.forceOrdering = forceOrdering;
            this.ps = repo.getPrintStream();
        }

        Row emit(Row r) throws IllegalArgumentException {
            // Emit the timestamp for the row
            ps.print(adjustTimeIfNeeded(r.timestamp));

            // Emit the bit vector describing which columns are included
            ps.append("\t");
            ps.append(Long.toHexString(r.bitVector));

            // Emit the column values, one per 1 bit in the bit vector
            long bitForColumn = 1;
            for (int i = 0; i < schema.nColumns; i++) {
                if (r.includes(bitForColumn)) {
                    ps.append("\t");
                    double val = r.values[i];
                    if (Double.isInfinite(val) || Double.isNaN(val)) {
                        ps.print("!"); // Readers clear this bit on read-back
                    } else if (lastRowEmitted != null && val == lastRowEmitted.values[i]) {
                        ps.print("*"); // Same as the last recorded value
                    } else {
                        ps.print(val);
                    }
                }
                bitForColumn = bitForColumn << 1;
            }
            ps.println();

            lastRowEmitted = r;
            return r;
        }

        /**
         * Return the value to record for newTime: a negative absolute value
         * for the first row, a non-negative delta (deflated units) after.
         *
         * FIX: track the deflated time actually written (lastEmittedTime).
         * The old code, when forceOrdering clamped an out-of-order row,
         * emitted deflate(oldTime) as a *delta*; since readers add each delta
         * to the running time, the decoded timestamp was roughly doubled and
         * all subsequent rows were shifted. Emitting a delta of 0 — and
         * computing later deltas against the on-disk time — keeps the file
         * decodable while still clamping to the newer timestamp.
         */
        private long adjustTimeIfNeeded(long newTime) {
            if (lastRowEmitted == null) {
                lastEmittedTime = deflate(newTime);
                return -lastEmittedTime; // Negative marks an absolute value
            }
            long delta = deflate(newTime) - lastEmittedTime;
            if (delta < 0) {
                if (!forceOrdering) {
                    throw new IllegalArgumentException(
                            "Timestamps out of sequence: " + newTime + ", "
                            + lastRowEmitted.timestamp);
                }
                // Clamp this row to the previously emitted timestamp
                delta = 0;
                logger.fine("Forcing timestamps into sequence: " + newTime + ", "
                        + lastRowEmitted.timestamp);
            }
            lastEmittedTime += delta;
            return delta;
        }
    }

    /*------------------------------------------------------------------------------
     *
 * PRIVATE - The class implementing the file-based repository
     * 
     *----------------------------------------------------------------------------*/

    private static class Repo {
        private final RowDescriptor schema;
        private final File dataFile;
        private final File hdrFile;
        private PrintStream ps; // Append stream to the data file; null until getRepo()

        private Repo(File container, String name, RowDescriptor schema) {
            this.schema = schema;
            this.dataFile = dataFile(container, name);
            this.hdrFile = headerFile(container, name);
            this.ps = null;
        }

        /** A repo exists only if BOTH the header and the data file exist. */
        static boolean repoExistsFor(File container, String baseName) {
            return headerFile(container, baseName).exists()
                    && dataFile(container, baseName).exists();
        }

        public void flush() {
            if (ps != null)
                ps.flush();
        }

        public void close() {
            if (ps != null)
                ps.close();
        }

        /**
         * Open the repository for (container, name), creating the header
         * and/or data file when absent, and position an append stream at
         * the end of the data file.
         *
         * @throws IOException if the data file exists without its header,
         *         or if the existing header is invalid or mismatched
         */
        static Repo getRepo(File container, String name, RowDescriptor schema) throws IOException {
            Repo repo = new Repo(container, name, schema);
            if (!repo.hdrFile.exists() && repo.dataFile.exists()) {
                // Danger! The data file has become "disconnected" from the
                // header file. Don't create a new data file - the data is valuable
                // Don't just create a new header file because you don't know
                // if the schemas match. It's safest to raise an exception.
                throw new FileNotFoundException("Data file without Header file");
            }

            repo.ensureValidHeader();
            if (!repo.dataFile.exists())
                repo.createDataFile();
            repo.ps = new PrintStream(new FileOutputStream(repo.dataFile, true)); // append
            return repo;
        }

        public PrintStream getPrintStream() {
            return ps;
        }

        public BufferedReader getReader() throws FileNotFoundException {
            return new BufferedReader(new FileReader(dataFile));
        }

        /**
         * Validate the existing header against the supplied schema, creating
         * or rewriting it when needed (new columns may only be appended).
         *
         * FIX: the old code never closed its reader on any of the exception
         * paths (resource leak); try-with-resources closes it always. A
         * non-numeric version line now raises IOException instead of letting
         * an unchecked NumberFormatException escape.
         */
        private void ensureValidHeader() throws IOException {
            if (!hdrFile.exists()) {
                createHeaderFile();
                return;
            }

            String[] declaredNames;
            try (BufferedReader reader = new BufferedReader(new FileReader(hdrFile))) {
                String line = reader.readLine();
                if (line == null)
                    throw new IOException("Empty Header File");

                int version;
                try {
                    version = Integer.valueOf(line);
                } catch (NumberFormatException e) {
                    throw new IOException("Invalid version in header: " + line, e);
                }
                if (version > RepoVersion)
                    throw new IOException("Can't read newer repo version :" + version + " vs " + RepoVersion);

                line = reader.readLine();
                if (line == null)
                    throw new IOException("Missing column name declarations");

                declaredNames = line.split("\t");
                if (declaredNames.length > schema.nColumns) {
                    throw new IOException("Mismatched column names - too few supplied names");
                }

                // Existing columns must match the schema prefix exactly
                for (int i = 0; i < declaredNames.length; i++) {
                    if (!declaredNames[i].equals(schema.columnNames[i])) {
                        throw new IOException("Mismatched column names");
                    }
                }
            }

            if (schema.nColumns > declaredNames.length) {
                logger.info("Adding new column(s)");
                createHeaderFile(); // We've got new columns! Overwrite the header file
            }
        }

        /**
         * (Re)write the header: a version line, then the tab-separated
         * column names. FIX: the old loop read columnNames[0] before its
         * bounds check and threw on an empty schema; this form handles
         * nColumns == 0 and closes the stream even on failure.
         */
        private void createHeaderFile() throws FileNotFoundException {
            PrintStream writer = new PrintStream(new FileOutputStream(hdrFile, false));
            try {
                writer.format("%d\n", RepoVersion);
                for (int i = 0; i < schema.nColumns; i++) {
                    if (i > 0)
                        writer.append("\t");
                    writer.append(schema.columnNames[i]);
                }
            } finally {
                writer.close();
            }
        }

        /**
         * Create a fresh data file containing a single comment line with the
         * creation date. FIX: the old code had a misplaced parenthesis —
         * `new PrintStream(new FileOutputStream(dataFile), false)` passed
         * false as PrintStream's autoflush rather than FileOutputStream's
         * append flag; both happen to be the defaults, so behavior is
         * unchanged, but the intent (truncate, don't append) is now explicit.
         */
        private void createDataFile() throws FileNotFoundException {
            PrintStream writer = new PrintStream(new FileOutputStream(dataFile, false));
            try {
                writer.format("# %s\n", (new Date().toString()));
            } finally {
                writer.close();
            }
        }

        private static File headerFile(File container, String baseName) {
            return new File(container, baseName + ".pts.hdr");
        }

        private static File dataFile(File container, String baseName) {
            return new File(container, baseName + ".pts.data");
        }

    }
}