qa.qcri.nadeef.core.pipeline.CleanExecutor.java Source code

Java tutorial

Introduction

Here is the source code for qa.qcri.nadeef.core.pipeline.CleanExecutor.java

Source

/*
 * QCRI, NADEEF LICENSE
 * NADEEF is an extensible, generalized and easy-to-deploy data cleaning platform built at QCRI.
 * NADEEF means "Clean" in Arabic
 *
 * Copyright (c) 2011-2013, Qatar Foundation for Education, Science and Community Development (on
 * behalf of Qatar Computing Research Institute) having its principle place of business in Doha,
 * Qatar with the registered address P.O box 5825 Doha, Qatar (hereinafter referred to as "QCRI")
 *
 * NADEEF has patent pending nevertheless the following is granted.
 * NADEEF is released under the terms of the MIT License, (http://opensource.org/licenses/MIT).
 */

package qa.qcri.nadeef.core.pipeline;

import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import qa.qcri.nadeef.core.datamodel.CleanPlan;
import qa.qcri.nadeef.core.datamodel.NadeefConfiguration;
import qa.qcri.nadeef.core.datamodel.ProgressReport;
import qa.qcri.nadeef.core.datamodel.Violation;
import qa.qcri.nadeef.core.util.sql.DBConnectionPool;
import qa.qcri.nadeef.core.util.sql.DBInstaller;
import qa.qcri.nadeef.tools.DBConfig;
import qa.qcri.nadeef.tools.PerfReport;
import qa.qcri.nadeef.tools.Tracer;

import java.util.HashSet;
import java.util.List;
import java.util.concurrent.TimeUnit;

/**
 * CleanPlan execution logic. It assembles the detect and repair pipelines ({@link Flow}s)
 * for a given {@link CleanPlan} and drives the cleaning execution.
 */
public class CleanExecutor {

    //<editor-fold desc="Private fields">
    private static final Tracer tracer = Tracer.getTracer(CleanExecutor.class);
    private CleanPlan cleanPlan;
    private NodeCacheManager cacheManager;
    // The stand-alone query flow is currently not assembled (source import, scoping and
    // iteration are part of the detect flow), so this field stays null. Every access to it
    // must therefore be null-safe.
    private Flow queryFlow;
    private Flow detectFlow;
    private Flow repairFlow;
    private DBConnectionPool connectionPool;
    private ExecutionContext context;
    //</editor-fold>

    //<editor-fold desc="Constructor / Deconstructor">

    /**
     * Constructor. Uses the default NADEEF configuration as the meta data DB config.
     * @param cleanPlan input {@link CleanPlan}.
     * @throws Exception when the executor cannot be set up.
     */
    public CleanExecutor(CleanPlan cleanPlan) throws Exception {
        this(cleanPlan, NadeefConfiguration.getDbConfig());
    }

    /**
     * Constructor.
     * @param cleanPlan input {@link CleanPlan}, must not be <code>null</code>.
     * @param dbConfig meta data dbconfig.
     * @throws Exception when the database installation or the pipeline assembly fails.
     */
    public CleanExecutor(CleanPlan cleanPlan, DBConfig dbConfig) throws Exception {
        this.cleanPlan = Preconditions.checkNotNull(cleanPlan);
        this.cacheManager = NodeCacheManager.getInstance();
        this.connectionPool = DBConnectionPool.createDBConnectionPool(cleanPlan.getSourceDBConfig(), dbConfig);

        try {
            DBInstaller.install(dbConfig);

            context = ExecutionContext.createExecutorContext();
            context.setConnectionPool(this.connectionPool);
            context.setRule(cleanPlan.getRule());
            assembleFlow();
        } catch (Exception ex) {
            // Do not hand out a half-initialized executor: release the connection pool
            // which was already created and report the failure to the caller.
            shutdown();
            throw ex;
        }
    }
    //</editor-fold>

    //<editor-fold desc="Incremental methods">

    /**
     * Registers new tuples of a table in the execution context. The registered tuples are
     * cleared again at the end of every {@link #detect()} / {@link #repair()} run.
     * @param tableName table name.
     * @param newTuples new tuples to register for the table.
     */
    public void incrementalAppend(String tableName, HashSet<Integer> newTuples) {
        context.addNewTuples(tableName, newTuples);
    }
    //</editor-fold>

    //<editor-fold desc="Public methods">

    /**
     * Returns <code>True</code> when the clean executor is running. A flow which has not
     * been assembled, or which was released by {@link #shutdown()}, counts as not running.
     * @return <code>True</code> when the clean executor is running.
     */
    public synchronized boolean isRunning() {
        // NOTE(review): run() holds the same monitor for its whole duration, so a call from
        // another thread presumably blocks until run() returns - confirm this is intended.
        return isFlowRunning(detectFlow) || isFlowRunning(queryFlow) || isFlowRunning(repairFlow);
    }

    /**
     * Shutdown the CleanExecutor. Running flows are forced to stop, all flows are released
     * and the connection pool is shut down. Calling it more than once is harmless.
     */
    public void shutdown() {
        stopFlow(queryFlow);
        queryFlow = null;

        stopFlow(detectFlow);
        detectFlow = null;

        stopFlow(repairFlow);
        repairFlow = null;

        if (connectionPool != null) {
            connectionPool.shutdown();
        }
        connectionPool = null;
    }

    /**
     * CleanExecutor finalizer.
     */
    @Override
    public void finalize() throws Throwable {
        try {
            shutdown();
        } finally {
            // the superclass finalizer has to run even when the shutdown fails.
            super.finalize();
        }
    }

    /**
     * Gets the output from Detect.
     * @return output object from Detect.
     */
    public List<Violation> getDetectViolation() {
        String key = detectFlow.getCurrentOutputKey();
        return cacheManager.get(key);
    }

    /**
     * Gets the output from Repair.
     * @return output object from repair.
     */
    @SuppressWarnings("unchecked")
    public <T> T getRepairOutput() {
        String key = repairFlow.getCurrentOutputKey();
        return (T) cacheManager.get(key);
    }

    /**
     * Gets the current progress percentage of Detect.
     * @return current progress percentage of Detect.
     */
    public double getDetectProgress() {
        return detectFlow.getProgress();
    }

    /**
     * Gets the detail progress information of Detection.
     * @return the detail progress information of Detection.
     */
    public List<ProgressReport> getDetailDetectProgress() {
        return detectFlow.getDetailProgress();
    }

    /**
     * Gets the current percentage of Repair.
     * @return current percentage of Repair.
     */
    public double getRepairProgress() {
        return repairFlow.getProgress();
    }

    /**
     * Gets the detail progress information of Repair.
     * @return the detail progress information of Repair.
     */
    public List<ProgressReport> getDetailRepairProgress() {
        return repairFlow.getDetailProgress();
    }

    /**
     * Runs the violation detection and records the elapsed time as
     * {@link PerfReport.Metric#DetectTime}.
     * @return this executor, to allow call chaining.
     */
    public CleanExecutor detect() {
        Stopwatch sw = Stopwatch.createStarted();

        detectFlow.reset();
        detectFlow.start();
        detectFlow.waitUntilFinish();

        // clear the new tuples after every run.
        context.clearNewTuples();

        PerfReport.appendMetric(PerfReport.Metric.DetectTime, sw.elapsed(TimeUnit.MILLISECONDS));
        sw.stop();

        // TODO: remove it.
        System.gc();
        return this;
    }

    /**
     * Gets the CleanPlan.
     * @return the CleanPlan.
     */
    public CleanPlan getCleanPlan() {
        return cleanPlan;
    }

    /**
     * Runs the violation repair and records the elapsed time as
     * {@link PerfReport.Metric#RepairTime}.
     * @return this executor, to allow call chaining.
     */
    public CleanExecutor repair() {
        Stopwatch sw = Stopwatch.createStarted();

        repairFlow.reset();
        repairFlow.start();
        repairFlow.waitUntilFinish();

        // clear the new tuples after every run.
        context.clearNewTuples();

        PerfReport.appendMetric(PerfReport.Metric.RepairTime, sw.elapsed(TimeUnit.MILLISECONDS));
        sw.stop();

        // TODO: remove it.
        System.gc();
        return this;
    }

    /**
     * Runs both the detection and repair.
     * @return this executor, to allow call chaining.
     */
    public synchronized CleanExecutor run() {
        detect();
        repair();
        return this;
    }
    //</editor-fold>

    //<editor-fold desc="Private members">

    /**
     * Null-safe running check for a single flow.
     * @param flow flow to check, may be <code>null</code>.
     * @return <code>True</code> when the flow exists and reports itself as running.
     */
    private static boolean isFlowRunning(Flow flow) {
        return flow != null && flow.isRunning();
    }

    /**
     * Forces the given flow to stop when it is still running.
     * @param flow flow to stop, may be <code>null</code>.
     */
    private static void stopFlow(Flow flow) {
        if (isFlowRunning(flow)) {
            flow.forceStop();
        }
    }

    /**
     * Assembles the detect and the repair flow.
     * @throws Exception when a flow cannot be assembled. The failure is traced and then
     * rethrown, because an executor without its flows cannot do any work.
     */
    @SuppressWarnings("unchecked")
    private void assembleFlow() throws Exception {
        try {
            // assemble the detect flow. It imports and scopes the source itself, that is
            // why no separate query flow is built.
            detectFlow = new Flow("detect");
            detectFlow.setInputKey(cacheManager.getKeyForNothing()).addNode(new SourceImport(context))
                    .addNode(new ScopeOperator(context)).addNode(new DirectIterator(context))
                    .addNode(new ViolationExportToCSV(context)).addNode(new ViolationCSVExport(context));

            // assemble the repair flow
            repairFlow = new Flow("repair");
            repairFlow.setInputKey(cacheManager.getKeyForNothing()).addNode(new ViolationImport(context))
                    .addNode(new ViolationRepair(context), 6).addNode(new FixExport(context));

        } catch (Exception ex) {
            tracer.err("Exception happens during assembling the pipeline ", ex);
            throw ex;
        }
    }
    //</editor-fold>
}