au.edu.uq.cmm.paul.grabber.Analyser.java Source code

Java tutorial

Introduction

Here is the source code for au.edu.uq.cmm.paul.grabber.Analyser.java

Source

/*
* Copyright 2012, CMM, University of Queensland.
*
* This file is part of Paul.
*
* Paul is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Paul is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with Paul. If not, see <http://www.gnu.org/licenses/>.
*/

package au.edu.uq.cmm.paul.grabber;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Comparator;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

import javax.persistence.EntityManager;
import javax.persistence.EntityManagerFactory;
import javax.persistence.TypedQuery;

import org.apache.commons.collections.Predicate;
import org.apache.commons.collections.PredicateUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import au.edu.uq.cmm.paul.Paul;
import au.edu.uq.cmm.paul.queue.QueueManager.DateRange;
import au.edu.uq.cmm.paul.status.Facility;
import au.edu.uq.cmm.paul.status.FacilityStatusManager;
import au.edu.uq.cmm.paul.watcher.UncPathnameMapper;

/**
 * This variation on the DataGrabber gathers DatasetMetadata records for all files
 * in a facility's directory tree, and compares them against the records in the DB.
 * The analyser also performs some basic integrity checks on the queue.
 * 
 * @author scrawley
 */
public class Analyser extends AbstractFileGrabber {

    /** Logger shared by all instances of this class; never reassigned. */
    private static final Logger LOG = LoggerFactory.getLogger(Analyser.class);

    /**
     * The kinds of problem that the queue integrity check can report.
     */
    public enum ProblemType {
        /** The dataset's metadata file does not exist. */
        METADATA_MISSING,
        /** The dataset's metadata file exists but is empty. */
        METADATA_SIZE,
        /** A captured data file does not exist. */
        FILE_MISSING,
        /** A captured data file's size differs from the size recorded in the metadata. */
        FILE_SIZE,
        /** A source data file's size differs from the size recorded in the metadata. */
        FILE_SIZE_2,
        /** A captured data file's hash differs from the hash recorded in the metadata. */
        FILE_HASH,
        /** A source data file's hash differs from the hash recorded in the metadata. */
        FILE_HASH_2,
        /** An IOException occurred while a data file was being checked. */
        IO_ERROR
    }

    /**
     * Orders datasets by source file base pathname and, where the pathnames
     * are equal, by the millisecond value of the last file timestamp.
     */
    private static final Comparator<DatasetMetadata> ORDER_BY_BASE_PATH_AND_TIME = new Comparator<DatasetMetadata>() {
        @Override
        public int compare(DatasetMetadata first, DatasetMetadata second) {
            int byPath = first.getSourceFilePathnameBase().compareTo(second.getSourceFilePathnameBase());
            if (byPath != 0) {
                return byPath;
            }
            long firstTime = first.getLastFileTimestamp().getTime();
            long secondTime = second.getLastFileTimestamp().getTime();
            return Long.compare(firstTime, secondTime);
        }
    };

    /**
     * Orders datasets by source file base pathname, then by the millisecond
     * value of the last file timestamp, and finally by dataset id, so that
     * datasets sharing a pathname and timestamp still compare as distinct.
     */
    private static final Comparator<DatasetMetadata> ORDER_BY_BASE_PATH_AND_TIME_AND_ID = new Comparator<DatasetMetadata>() {
        @Override
        public int compare(DatasetMetadata first, DatasetMetadata second) {
            int byPath = first.getSourceFilePathnameBase().compareTo(second.getSourceFilePathnameBase());
            if (byPath != 0) {
                return byPath;
            }
            int byTime = Long.compare(first.getLastFileTimestamp().getTime(),
                    second.getLastFileTimestamp().getTime());
            if (byTime != 0) {
                return byTime;
            }
            return first.getId().compareTo(second.getId());
        }
    };

    // Work entries handed to enqueueWorkEntry(); consumed when the in-folder
    // metadata is built.
    private final BlockingQueue<Runnable> queue = new LinkedBlockingQueue<Runnable>();
    // Collaborators obtained from the Paul services object in the constructor.
    private final FacilityStatusManager fsm;
    private final EntityManagerFactory emf;
    private final UncPathnameMapper uncNameMapper;

    // Results of the most recent analyse() call.
    // Groups built from the in-database and in-folder datasets.
    private List<Group> grouped;
    // Statistics over all groups.
    private Statistics all;
    // Statistics for last file timestamps before lwm, between lwm and hwm
    // (inclusive), and after hwm; all three are null unless both lwm and hwm
    // were supplied.
    private Statistics beforeLWM;
    private Statistics intertidal;
    private Statistics afterHWM;
    // Outcome of the integrity check (replaceable via setProblems()).
    private Problems problems;
    // Statistics for last file timestamps before qStart, between qStart and
    // qEnd (inclusive), and after qEnd; all three are null when no queue range
    // was supplied.
    private Statistics beforeQStart;
    private Statistics inQueue;
    private Statistics afterQEnd;

    // The lwmTimestamp / hwmTimestamp arguments of the last analyse() call.
    private Date lwm;
    private Date hwm;
    // From / to dates of the queue range passed to analyse(), or null.
    private Date qStart;
    private Date qEnd;
    // Earliest / latest last file timestamp among the in-folder datasets,
    // or null when there are none.
    private Date fStart;
    private Date fEnd;
    // Whether the integrity check should also compare file hashes.
    private boolean checkHashes;

    /**
     * Creates an analyser for one facility.
     *
     * @param services supplies the facility status manager, the UNC pathname
     *        mapper and the entity manager factory used by this analyser
     * @param facility the facility to analyse; passed on to the superclass
     */
    public Analyser(Paul services, Facility facility) {
        super(services, facility);
        this.fsm = services.getFacilityStatusManager();
        this.uncNameMapper = services.getUncNameMapper();
        this.emf = services.getEntityManagerFactory();
    }

    /**
     * Runs the analysis for this analyser's facility: collects the in-folder
     * and in-database dataset metadata, groups them, gathers statistics for
     * the requested time ranges, and runs the queue integrity check. The
     * results are stored in this object and are available from the getters.
     *
     * @param lwmTimestamp lower bound of the watermark range, or null
     * @param hwmTimestamp upper bound of the watermark range, or null; the
     *        watermark statistics are only gathered when both bounds are given
     * @param queueRange the queue date range, or null to skip the
     *        queue-relative statistics
     * @param checkHashes whether the integrity check should compare file hashes
     * @return this analyser
     */
    public Analyser analyse(Date lwmTimestamp, Date hwmTimestamp, DateRange queueRange, boolean checkHashes) {
        this.lwm = lwmTimestamp;
        this.hwm = hwmTimestamp;
        if (queueRange == null) {
            this.qStart = null;
            this.qEnd = null;
        } else {
            this.qStart = queueRange.getFromDate();
            this.qEnd = queueRange.getToDate();
        }
        this.checkHashes = checkHashes;

        String facilityName = getFacility().getFacilityName();
        LOG.info("Analysing queues and folders for {}", facilityName);
        SortedSet<DatasetMetadata> inFolder = buildInFolderMetadata();
        SortedSet<DatasetMetadata> inDatabase = buildInDatabaseMetadata();
        LOG.debug("Got {} in folders and {} in database", inFolder.size(), inDatabase.size());

        LOG.info("Grouping datasets for {}", facilityName);
        grouped = groupDatasets(inFolder, inDatabase);
        LOG.debug("Got {} groups", grouped.size());

        LOG.info("Gathering statistics for {}", facilityName);
        determineFolderRange(inFolder);
        all = gatherStats(grouped, PredicateUtils.truePredicate());
        if (hwmTimestamp == null || lwmTimestamp == null) {
            beforeLWM = null;
            afterHWM = null;
            intertidal = null;
        } else {
            long lwmTime = lwmTimestamp.getTime();
            long hwmTime = hwmTimestamp.getTime();
            beforeLWM = gatherStats(grouped, lastFileTimeBefore(lwmTime));
            afterHWM = gatherStats(grouped, lastFileTimeAfter(hwmTime));
            intertidal = gatherStats(grouped, lastFileTimeWithin(lwmTime, hwmTime));
        }
        if (queueRange == null) {
            afterQEnd = null;
            beforeQStart = null;
            inQueue = null;
        } else {
            long queueStartTime = this.qStart.getTime();
            long queueEndTime = this.qEnd.getTime();
            beforeQStart = gatherStats(grouped, lastFileTimeBefore(queueStartTime));
            afterQEnd = gatherStats(grouped, lastFileTimeAfter(queueEndTime));
            inQueue = gatherStats(grouped, lastFileTimeWithin(queueStartTime, queueEndTime));
        }

        LOG.info("Performing queue entry integrity checks for {}", facilityName);
        problems = integrityCheck(grouped);
        return this;
    }

    /** Extracts the last file timestamp, in milliseconds, from a predicate argument. */
    private static long lastFileTime(Object metadata) {
        return ((DatasetMetadata) metadata).getLastFileTimestamp().getTime();
    }

    /** Builds a predicate accepting datasets whose last file time is strictly before the limit. */
    private static Predicate lastFileTimeBefore(final long limit) {
        return new Predicate() {
            @Override
            public boolean evaluate(Object metadata) {
                return lastFileTime(metadata) < limit;
            }
        };
    }

    /** Builds a predicate accepting datasets whose last file time is strictly after the limit. */
    private static Predicate lastFileTimeAfter(final long limit) {
        return new Predicate() {
            @Override
            public boolean evaluate(Object metadata) {
                return lastFileTime(metadata) > limit;
            }
        };
    }

    /** Builds a predicate accepting datasets whose last file time lies in [from, to]. */
    private static Predicate lastFileTimeWithin(final long from, final long to) {
        return new Predicate() {
            @Override
            public boolean evaluate(Object metadata) {
                long time = lastFileTime(metadata);
                return time >= from && time <= to;
            }
        };
    }

    /**
     * Records in fStart and fEnd the earliest and latest last file timestamps
     * found among the in-folder datasets. Both are set to null when the set
     * is empty.
     *
     * @param inFolder the datasets found in the facility's folder
     */
    private void determineFolderRange(SortedSet<DatasetMetadata> inFolder) {
        if (inFolder.isEmpty()) {
            fStart = null;
            fEnd = null;
            return;
        }
        boolean first = true;
        for (DatasetMetadata ds : inFolder) {
            Date ts = ds.getLastFileTimestamp();
            if (first) {
                // Seed both ends of the range with the first dataset.
                fStart = ts;
                fEnd = ts;
                first = false;
            } else if (ts.getTime() < fStart.getTime()) {
                fStart = ts;
            } else if (ts.getTime() > fEnd.getTime()) {
                fEnd = ts;
            }
        }
    }

    /**
     * Checks the latest in-database dataset of every group against the file
     * system: the metadata file must exist and be non-empty, each captured
     * data file must exist and match the recorded size (and hash, when hash
     * checking is enabled), and each source file that still exists must match
     * the recorded size (and hash). Every problem is logged and collected.
     *
     * @param grouped the groups to check
     * @return the problems that were found
     */
    private Problems integrityCheck(List<Group> grouped) {
        List<Problem> found = new ArrayList<Problem>();
        for (Group group : grouped) {
            // Check only the latest queue entry.  Older ones are not really
            // relevant, and besides they typically have the "problem" that
            // one or more captured component datafiles no longer matches the
            // in-folder dataset. (Which has typically been recaptured.)
            DatasetMetadata dataset = group.getLatestInDatabase();
            if (dataset == null) {
                continue;
            }
            checkMetadataFile(dataset, found);
            for (DatafileMetadata datafile : dataset.getDatafiles()) {
                try {
                    String hash = checkHashes ? datafile.getDatafileHash() : null;
                    if (checkHashes) {
                        LOG.debug("stored hash - " + hash);
                    }
                    File file = new File(datafile.getCapturedFilePathname());
                    checkCapturedFile(dataset, datafile, file, hash, found);
                    checkSourceFile(dataset, datafile, file, hash, found);
                } catch (IOException ex) {
                    LOG.error("Unexpected IOException while checking hashes", ex);
                    logProblem(dataset, datafile, ProblemType.IO_ERROR, found,
                            "IO error while checking file hashes - see logs");
                }
            }
        }
        LOG.info("Queue integrity check for '" + getFacility().getFacilityName() + "' found " + found.size()
                + " problems (listed above)");
        return new Problems(found);
    }

    /** Reports a missing or empty metadata file for the dataset. */
    private void checkMetadataFile(DatasetMetadata dataset, List<Problem> found) {
        File adminFile = new File(dataset.getMetadataFilePathname());
        if (!adminFile.exists()) {
            logProblem(dataset, null, ProblemType.METADATA_MISSING, found,
                    "Metadata file missing: " + adminFile);
        } else if (adminFile.length() == 0) {
            logProblem(dataset, null, ProblemType.METADATA_SIZE, found, "Metadata file empty: " + adminFile);
        }
    }

    /** Checks existence, size and (when enabled) hash of a captured data file. */
    private void checkCapturedFile(DatasetMetadata dataset, DatafileMetadata datafile, File file, String hash,
            List<Problem> found) throws IOException {
        if (!file.exists()) {
            logProblem(dataset, datafile, ProblemType.FILE_MISSING, found,
                    "Data file missing: " + file);
        } else if (file.length() != datafile.getFileSize()) {
            logProblem(dataset, datafile, ProblemType.FILE_SIZE, found,
                    "Data file size mismatch: " + file + ": admin metadata says "
                            + datafile.getFileSize() + " but actual captured file size is "
                            + file.length());
        } else if (checkHashes) {
            // Hash the file once and reuse the value for both the comparison
            // and the debug message.
            String capturedHash = HashUtils.fileHash(file);
            if (hash != null && !hash.equals(capturedHash)) {
                logProblem(dataset, datafile, ProblemType.FILE_HASH, found,
                        "Data file hash mismatch between metadata and " + file);
            } else {
                LOG.debug("captured hash - " + capturedHash);
            }
        }
    }

    /** Checks size and (when enabled) hash of the original source file, if it still exists. */
    private void checkSourceFile(DatasetMetadata dataset, DatafileMetadata datafile, File file, String hash,
            List<Problem> found) throws IOException {
        File source = new File(datafile.getSourceFilePathname());
        if (!source.exists()) {
            return;
        }
        if (source.length() != datafile.getFileSize()) {
            // NOTE(review): this message names the captured file although the
            // comparison is against the source file - confirm that is intended.
            logProblem(dataset, datafile, ProblemType.FILE_SIZE_2, found,
                    "Data file size mismatch: " + file + ": original file size is "
                            + source.length() + " but admin metadata says "
                            + datafile.getFileSize());
        } else if (checkHashes) {
            // As for the captured file: hash once, then compare or log.
            String sourceHash = HashUtils.fileHash(source);
            if (hash != null && !hash.equals(sourceHash)) {
                logProblem(dataset, datafile, ProblemType.FILE_HASH_2, found,
                        "Data file hash mismatch between metadata and " + source);
            } else {
                LOG.debug("source hash - " + sourceHash);
            }
        }
    }

    /**
     * Logs a problem at INFO level and appends a matching Problem record to
     * the given list.
     *
     * @param dataset the dataset the problem belongs to
     * @param datafile the affected data file, or null when the problem
     *        concerns the dataset as a whole
     * @param type the kind of problem
     * @param list the list that receives the new Problem
     * @param details human-readable description of the problem
     */
    private void logProblem(DatasetMetadata dataset, DatafileMetadata datafile, ProblemType type,
            List<Problem> list, String details) {
        String message = "Problem in dataset #" + dataset.getId() + ": " + details;
        LOG.info(message);
        Problem problem = new Problem(dataset, datafile, type, details);
        list.add(problem);
    }

    /**
     * Counts, over all groups, the in-folder and in-database datasets that
     * satisfy the predicate, and derives the per-group tallies that are
     * passed to the Statistics constructor.
     *
     * @param grouped the groups to examine
     * @param predicate selects the datasets to count; it is applied to
     *        DatasetMetadata objects
     * @return the gathered statistics
     */
    private Statistics gatherStats(List<Group> grouped, Predicate predicate) {
        int folderDatasets = 0;
        int databaseDatasets = 0;
        int folderUnmatched = 0;
        int databaseUnmatchedGroups = 0;
        int duplicateGroups = 0;
        int databaseGroups = 0;
        for (Group group : grouped) {
            if (group.getInFolder() != null && predicate.evaluate(group.getInFolder())) {
                folderDatasets++;
                if (group.getAllInDatabase().size() == 0) {
                    // Selected in-folder dataset with no database entries at all.
                    folderUnmatched++;
                }
            }

            int selected = 0;
            boolean sawMatch = false;
            for (DatasetMetadata candidate : group.getAllInDatabase()) {
                if (!predicate.evaluate(candidate)) {
                    continue;
                }
                selected++;
                if (group.inFolder != null && matches(group.inFolder, candidate)) {
                    sawMatch = true;
                }
            }

            databaseDatasets += selected;
            if (group.inFolder != null && !sawMatch) {
                // There is an in-folder dataset, but no selected database
                // entry has the same base pathname and timestamp.
                databaseUnmatchedGroups++;
            }
            if (selected > 0) {
                databaseGroups++;
                if (selected > 1) {
                    duplicateGroups++;
                }
            }
        }
        return new Statistics(folderDatasets, databaseDatasets, databaseGroups,
                duplicateGroups, folderUnmatched, databaseUnmatchedGroups);
    }

    /**
     * Tests whether two datasets have the same source file base pathname and
     * the same last file timestamp (compared in milliseconds).
     *
     * @param d1 the first dataset
     * @param d2 the second dataset
     * @return true if both the pathname and the timestamp agree
     */
    static boolean matches(DatasetMetadata d1, DatasetMetadata d2) {
        if (!d1.getSourceFilePathnameBase().equals(d2.getSourceFilePathnameBase())) {
            return false;
        }
        long firstTime = d1.getLastFileTimestamp().getTime();
        long secondTime = d2.getLastFileTimestamp().getTime();
        return firstTime == secondTime;
    }

    /**
     * Builds the groups: first one group per base pathname from the
     * in-database datasets, then merges in the in-folder datasets.
     *
     * @param inFolder the datasets found in the facility's folder
     * @param inDatabase the datasets recorded in the database
     * @return the merged list of groups
     */
    private List<Group> groupDatasets(Collection<DatasetMetadata> inFolder,
            Collection<DatasetMetadata> inDatabase) {
        ArrayList<Group> fromDatabase = createGroupsFromDatabase(inDatabase);
        return mergeGroupsFromFolder(fromDatabase, inFolder);
    }

    /**
     * Creates groups from the in-database datasets. A dataset is included
     * when its capture timestamp or its last file timestamp passes the
     * intertidal test. Consecutive datasets with the same base pathname are
     * placed in the same group, so the input is expected to be iterated in
     * base pathname order.
     *
     * @param inDatabase the datasets recorded in the database
     * @return the groups, in the order in which they were first encountered
     */
    private ArrayList<Group> createGroupsFromDatabase(Collection<DatasetMetadata> inDatabase) {
        ArrayList<Group> result = new ArrayList<Group>();
        Group current = null;
        for (DatasetMetadata dataset : inDatabase) {
            boolean relevant = intertidal(dataset.getCaptureTimestamp())
                    || intertidal(dataset.getLastFileTimestamp());
            if (relevant) {
                String basePath = dataset.getSourceFilePathnameBase();
                boolean sameGroup = current != null && current.getBasePathname().equals(basePath);
                if (!sameGroup) {
                    // The base pathname changed (or this is the first dataset): open a new group.
                    current = new Group(basePath);
                    result.add(current);
                }
                current.addInDatabase(dataset);
            }
        }
        return result;
    }

    /**
     * Tests whether a timestamp lies between lwm and hwm, both inclusive.
     * A bound that is null is treated as absent.
     *
     * @param timestamp the timestamp to test
     * @return true if the timestamp is not before lwm and not after hwm
     */
    private boolean intertidal(Date timestamp) {
        if (lwm != null && timestamp.getTime() < lwm.getTime()) {
            return false;
        }
        if (hwm != null && timestamp.getTime() > hwm.getTime()) {
            return false;
        }
        return true;
    }

    /**
     * Merges the in-folder datasets into the groups built from the database
     * by walking both sequences in step and comparing base pathnames. A
     * dataset whose base pathname equals the current group's is attached to
     * that group. A dataset with no corresponding group gets a new group of
     * its own, but only if its last file timestamp passes the intertidal
     * test. Groups with no corresponding dataset are passed through as-is.
     *
     * @param groups the groups created from the database
     * @param inFolder the datasets found in the facility's folder
     * @return the merged list of groups
     */
    private ArrayList<Group> mergeGroupsFromFolder(ArrayList<Group> groups, Collection<DatasetMetadata> inFolder) {
        ArrayList<Group> merged = new ArrayList<Group>();
        Iterator<Group> groupIt = groups.iterator();
        Iterator<DatasetMetadata> datasetIt = inFolder.iterator();
        Group group = groupIt.hasNext() ? groupIt.next() : null;
        DatasetMetadata dataset = datasetIt.hasNext() ? datasetIt.next() : null;
        while (group != null || dataset != null) {
            // order < 0: consume the group only; order == 0: consume both;
            // order > 0: consume the dataset only.
            int order;
            if (dataset == null) {
                order = -1;
            } else if (group == null) {
                order = 1;
            } else {
                order = group.getBasePathname().compareTo(dataset.getSourceFilePathnameBase());
            }

            if (order <= 0) {
                if (order == 0) {
                    group.setInFolder(dataset);
                }
                merged.add(group);
                group = groupIt.hasNext() ? groupIt.next() : null;
            }
            if (order >= 0) {
                if (order > 0 && intertidal(dataset.getLastFileTimestamp())) {
                    Group folderOnly = new Group(dataset.getSourceFilePathnameBase());
                    folderOnly.setInFolder(dataset);
                    merged.add(folderOnly);
                }
                dataset = datasetIt.hasNext() ? datasetIt.next() : null;
            }
        }
        return merged;
    }

    /**
     * Loads the facility's DatasetMetadata records, with their data files
     * fetched by the query, into a set ordered by base pathname, last file
     * timestamp and id.
     *
     * @return the datasets recorded in the database for this facility
     */
    private SortedSet<DatasetMetadata> buildInDatabaseMetadata() {
        SortedSet<DatasetMetadata> result = new TreeSet<DatasetMetadata>(ORDER_BY_BASE_PATH_AND_TIME_AND_ID);
        EntityManager em = emf.createEntityManager();
        try {
            String queryText = "from DatasetMetadata m left join fetch m.datafiles where m.facilityName = :name";
            TypedQuery<DatasetMetadata> query = em.createQuery(queryText, DatasetMetadata.class);
            query.setParameter("name", getFacility().getFacilityName());
            List<DatasetMetadata> rows = query.getResultList();
            for (DatasetMetadata ds : rows) {
                boolean added = result.add(ds);
                if (added) {
                    // Touch the collection while the entity manager is open;
                    // presumably this forces it to be loaded - TODO confirm.
                    ds.getDatafiles().size();
                }
            }
        } finally {
            em.close();
        }
        return result;
    }

    /**
     * Scans the facility's folder tree and assembles a DatasetMetadata for
     * every work entry the scan enqueues. The result is ordered by base
     * pathname and last file timestamp. It is empty when the facility has no
     * folder name or the folder name cannot be mapped to a local directory.
     *
     * @return the datasets found in the facility's folder
     */
    private SortedSet<DatasetMetadata> buildInFolderMetadata() {
        TreeSet<DatasetMetadata> inFolder = new TreeSet<DatasetMetadata>(ORDER_BY_BASE_PATH_AND_TIME);
        String folderName = getFacility().getFolderName();
        if (folderName == null) {
            return inFolder;
        }
        File localDir = uncNameMapper.mapUncPathname(folderName);
        if (localDir == null) {
            return inFolder;
        }
        fsm.getStatus(getFacility()).setLocalDirectory(localDir);
        // Discard anything left over from an earlier run so that a repeated
        // analyse() call only sees the entries produced by this scan.
        queue.clear();
        analyseTree(localDir, Long.MIN_VALUE, Long.MAX_VALUE);
        // Drain the queue as we go so processed entries are not retained.
        Runnable runnable;
        while ((runnable = queue.poll()) != null) {
            WorkEntry entry = (WorkEntry) runnable;
            SessionDetails session = fsm.getSessionDetails(getFacility(), entry.getTimestamp().getTime(),
                    entry.getBaseFile());
            entry.pretendToGrabFiles();
            inFolder.add(entry.assembleDatasetMetadata(null, session, new File("")));
        }
        return inFolder;
    }

    /**
     * Adds the work entry to this analyser's internal queue, from which it
     * is later read when the in-folder metadata is built.
     *
     * @param entry the work entry to record
     */
    @Override
    protected void enqueueWorkEntry(WorkEntry entry) {
        queue.add(entry);
    }

    /** @return the groups built by the last analyse() call */
    public final List<Group> getGrouped() {
        return this.grouped;
    }

    /** @return the statistics gathered over all groups */
    public final Statistics getAll() {
        return this.all;
    }

    /** @return the statistics for timestamps before the LWM, or null if not gathered */
    public final Statistics getBeforeLWM() {
        return this.beforeLWM;
    }

    /** @return the statistics for timestamps between the LWM and HWM, or null if not gathered */
    public final Statistics getIntertidal() {
        return this.intertidal;
    }

    /** @return the statistics for timestamps after the HWM, or null if not gathered */
    public final Statistics getAfterHWM() {
        return this.afterHWM;
    }

    /** @return the statistics for timestamps before the queue start, or null if not gathered */
    public final Statistics getBeforeQStart() {
        return this.beforeQStart;
    }

    /** @return the statistics for timestamps within the queue range, or null if not gathered */
    public final Statistics getInQueue() {
        return this.inQueue;
    }

    /** @return the statistics for timestamps after the queue end, or null if not gathered */
    public final Statistics getAfterQEnd() {
        return this.afterQEnd;
    }

    /** @return the problems found by the integrity check, or set via setProblems */
    public final Problems getProblems() {
        return this.problems;
    }

    /** @return the LWM timestamp passed to the last analyse() call */
    public final Date getLWM() {
        return this.lwm;
    }

    /** @return the HWM timestamp passed to the last analyse() call */
    public final Date getHWM() {
        return this.hwm;
    }

    /** @return the from-date of the queue range, or null if no range was given */
    public final Date getqStart() {
        return this.qStart;
    }

    /** @return the to-date of the queue range, or null if no range was given */
    public final Date getqEnd() {
        return this.qEnd;
    }

    /** @return the earliest last file timestamp among the in-folder datasets, or null if there are none */
    public final Date getfStart() {
        return this.fStart;
    }

    /** @return the latest last file timestamp among the in-folder datasets, or null if there are none */
    public final Date getfEnd() {
        return this.fEnd;
    }

    /**
     * Replaces the stored problems.
     *
     * @param problems the new problems object
     */
    public final void setProblems(Problems problems) {
        this.problems = problems;
    }

    /**
     * Always reports that this grabber is not shut down.
     *
     * @return false
     */
    @Override
    protected boolean isShutDown() {
        return false;
    }
}