gridool.db.partitioning.phihash.csv.normal.CsvPartitioningTask.java Source code

Introduction

Here is the source code for gridool.db.partitioning.phihash.csv.normal.CsvPartitioningTask.java. The class reads a local CSV file line by line, buffers records into a bounded queue, and shuffles each full buffer to a hash-partitioning grid job that decides which node every record is assigned to.

Source

/*
 * @(#)$Id$
 *
 * Copyright 2006-2008 Makoto YUI
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * Contributors:
 *     Makoto YUI - initial implementation
 */
package gridool.db.partitioning.phihash.csv.normal;

import gridool.GridException;
import gridool.GridJob;
import gridool.GridJobFuture;
import gridool.GridKernel;
import gridool.GridNode;
import gridool.GridResourceRegistry;
import gridool.Settings;
import gridool.annotation.GridKernelResource;
import gridool.annotation.GridRegistryResource;
import gridool.construct.GridTaskAdapter;
import gridool.db.helpers.DBAccessor;
import gridool.db.helpers.ForeignKey;
import gridool.db.helpers.GridDbUtils;
import gridool.db.helpers.PrimaryKey;
import gridool.db.partitioning.phihash.DBPartitioningJobConf;
import gridool.db.partitioning.phihash.PartitioningJobType;
import gridool.db.partitioning.phihash.csv.PartitioningJobConf;
import gridool.util.collections.ArrayQueue;
import gridool.util.collections.BoundedArrayQueue;
import gridool.util.concurrent.DirectExecutorService;
import gridool.util.concurrent.ExecutorFactory;
import gridool.util.concurrent.ExecutorUtils;
import gridool.util.csv.CsvReader;
import gridool.util.csv.SimpleCsvReader;
import gridool.util.io.FastBufferedInputStream;
import gridool.util.io.IOUtils;
import gridool.util.primitive.MutableInt;
import gridool.util.primitive.MutableLong;
import gridool.util.primitive.Primitives;
import gridool.util.struct.Pair;
import gridool.util.system.SystemUtils;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PushbackReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadPoolExecutor;

import javax.annotation.Nonnull;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
 * 
 * <DIV lang="en">Reads a local CSV file and shuffles its records to grid nodes through a hash-partitioning job.</DIV>
 * <DIV lang="ja"></DIV>
 * 
 * @author Makoto YUI (yuin405@gmail.com)
 */
public class CsvPartitioningTask extends GridTaskAdapter {
    private static final long serialVersionUID = -4477383489963213348L;
    private static final Log LOG = LogFactory.getLog(CsvPartitioningTask.class);

    private static final int csvInputBufSize;
    private static final int DEFAULT_SHUFFLE_UNITS;
    private static final int DEFAULT_SHUFFLE_THREADS;
    static {
        csvInputBufSize = Primitives.parseInt(Settings.get("gridool.db.partitioning.csv_reader.bufsize"),
                32 * 1024);
        DEFAULT_SHUFFLE_UNITS = Primitives.parseInt(Settings.get("gridool.db.partitioning.shuffle_units"), 20000);
        int defaultNumThread = Math.max(2, SystemUtils.availableProcessors() - 1);
        DEFAULT_SHUFFLE_THREADS = Primitives.parseInt(Settings.get("gridool.db.partitioning.shuffle_threads"),
                defaultNumThread);
    }

    @Nonnull
    protected final DBPartitioningJobConf jobConf;

    // ------------------------
    // injected resources

    @GridKernelResource
    protected transient GridKernel kernel;

    @GridRegistryResource
    private transient GridResourceRegistry registry;

    // ------------------------
    // working resources

    protected transient int shuffleUnits = DEFAULT_SHUFFLE_UNITS; // 200 bytes/line * 100 nodes * 20,000 lines * 4 threads = 1600MB
    protected transient int shuffleThreads = DEFAULT_SHUFFLE_THREADS;

    protected transient ExecutorService shuffleExecPool;
    protected transient BoundedArrayQueue<String> shuffleSink;

    protected HashMap<GridNode, MutableLong> assignMap;

    protected transient String csvFileName;
    private transient boolean isFirstShuffle = true;
    private transient Pair<PrimaryKey, Collection<ForeignKey>> primaryForeignKeys;

    @SuppressWarnings("unchecked")
    public CsvPartitioningTask(GridJob job, DBPartitioningJobConf jobConf) {
        super(job, false);
        this.jobConf = jobConf;
    }

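    // Returning true asks the kernel to inject the @GridKernelResource and
    // @GridRegistryResource fields declared above before this task executes.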
    @Override
    public boolean injectResources() {
        return true;
    }

    public int shuffleUnits() {
        return shuffleUnits;
    }

    public void setShuffleUnits(int shuffleUnits) {
        this.shuffleUnits = shuffleUnits;
    }

    public int shuffleThreads() {
        return shuffleThreads;
    }

    public void setShuffleThreads(int shuffleThreads) {
        this.shuffleThreads = shuffleThreads;
    }

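    /**
     * Reads the CSV file line by line, shuffling buffered records to
     * partitioning jobs, and returns the per-node record assignment counts.
     */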
    protected HashMap<GridNode, MutableLong> execute() throws GridException {
        int numShuffleThreads = shuffleThreads();
        this.shuffleExecPool = (numShuffleThreads <= 0) ? new DirectExecutorService()
                : ExecutorFactory.newBoundedWorkQueueFixedThreadPool(numShuffleThreads, "Gridool#Shuffle", true,
                        new ThreadPoolExecutor.CallerRunsPolicy());
        this.shuffleSink = new BoundedArrayQueue<String>(shuffleUnits());
        this.csvFileName = generateCsvFileName();
        this.assignMap = new HashMap<GridNode, MutableLong>(64);

        // inquire primary foreign keys of the partitioning table
        DBAccessor dba = registry.getDbAccessor();
        String templateTableName = jobConf.getBaseTableName();
        this.primaryForeignKeys = GridDbUtils.getPrimaryForeignKeys(dba, templateTableName);

        // parse and shuffle a CSV file
        final CsvReader reader = getCsvReader(jobConf);
        int numShuffled = 0;
        try {
            String line;
            while ((line = reader.getNextLine()) != null) {
                shuffle(line);
                numShuffled++;
            }
        } catch (IOException e) {
            LOG.error(e);
            throw new GridException(e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
        postShuffle(numShuffled);
        return assignMap;
    }

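    // Builds a per-sender output file name: the table name followed by the
    // sender's physical host address, with a ".csv" suffix.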
    private String generateCsvFileName() {
        assert (kernel != null);
        String tblName = jobConf.getTableName();
        GridNode node = getSenderNode();
        String addr = node.getPhysicalAdress().getHostAddress();
        return tblName + addr + ".csv";
    }

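    // Buffers a record into the bounded sink; when the sink is full, hands the
    // whole buffer off to a shuffle job and starts a fresh buffer with this record.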
    private void shuffle(@Nonnull final String record) {
        assert (shuffleSink != null);
        if (!shuffleSink.offer(record)) {
            invokeShuffle(shuffleExecPool, shuffleSink);
            this.shuffleSink = new BoundedArrayQueue<String>(shuffleUnits());
            shuffleSink.offer(record);
        }
    }

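    // Flushes any records remaining in the sink, then waits for in-flight
    // shuffle jobs to finish.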
    protected void postShuffle(int numShuffled) {
        if (!shuffleSink.isEmpty()) {
            invokeShuffle(shuffleExecPool, shuffleSink);
        }
        ExecutorUtils.shutdownAndAwaitTermination(shuffleExecPool);
    }

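    // Dispatches a full buffer as a partitioning job. The first buffer is run
    // synchronously on the calling thread (the 'true' flag marks the first
    // shuffle); subsequent buffers are submitted to the shuffle thread pool.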
    private final void invokeShuffle(@Nonnull final ExecutorService shuffleExecPool,
            @Nonnull final ArrayQueue<String> queue) {
        assert (kernel != null);
        final String[] lines = queue.toArray(String.class);
        final String fileName = csvFileName;
        if (isFirstShuffle) {
            PartitioningJobConf conf = new PartitioningJobConf(lines, fileName, true, primaryForeignKeys, jobConf);
            runShuffleJob(kernel, conf, assignMap, deploymentGroup);
            this.isFirstShuffle = false;
        } else {
            shuffleExecPool.execute(new Runnable() {
                public void run() {
                    PartitioningJobConf conf = new PartitioningJobConf(lines, fileName, false, primaryForeignKeys,
                            jobConf);
                    runShuffleJob(kernel, conf, assignMap, deploymentGroup);
                }
            });
        }
    }

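    // Executes the configured partitioning job, blocks for its result, and merges
    // the returned per-node record counts into the shared assignment map under
    // its monitor lock.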
    private static void runShuffleJob(final GridKernel kernel, final PartitioningJobConf conf,
            final Map<GridNode, MutableLong> recMap, final String deploymentGroup) {
        PartitioningJobType jobType = conf.getJobConf().getJobType();
        Class<? extends GridJob<PartitioningJobConf, Map<GridNode, MutableInt>>> jobClass = jobType
                .getFirstPartitioningJobClass();
        //final GridJobFuture<Map<GridNode, MutableInt>> future = kernel.execute(CsvHashPartitioningJob.class, conf);
        //final GridJobFuture<Map<GridNode, MutableInt>> future = kernel.execute(GlobalCsvHashPartitioningJob.class, conf);
        final GridJobFuture<Map<GridNode, MutableInt>> future = kernel.execute(jobClass, conf);
        final Map<GridNode, MutableInt> map;
        try {
            map = future.get(); // wait for execution
        } catch (InterruptedException ie) {
            LOG.error(ie.getMessage(), ie);
            throw new IllegalStateException(ie);
        } catch (ExecutionException ee) {
            LOG.error(ee.getMessage(), ee);
            throw new IllegalStateException(ee);
        }
        synchronized (recMap) {
            for (final Map.Entry<GridNode, MutableInt> e : map.entrySet()) {
                GridNode node = e.getKey();
                MutableInt assigned = e.getValue();
                long v = assigned.longValue();
                MutableLong prev = recMap.get(node);
                if (prev == null) {
                    recMap.put(node, new MutableLong(v));
                } else {
                    prev.add(v);
                }
            }
        }
    }

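    // Opens the CSV file as a buffered UTF-8 stream, wrapped in the
    // PushbackReader that SimpleCsvReader's constructor requires.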
    private static final CsvReader getCsvReader(final DBPartitioningJobConf jobConf) throws GridException {
        final String csvPath = jobConf.getCsvFilePath();
        final Reader reader;
        try {
            FileInputStream fis = new FileInputStream(csvPath);
            FastBufferedInputStream bis = new FastBufferedInputStream(fis, csvInputBufSize);
            reader = new InputStreamReader(bis, "UTF-8");
        } catch (FileNotFoundException fne) {
            LOG.error(fne);
            throw new GridException("CSV file not found: " + csvPath, fne);
        } catch (UnsupportedEncodingException uee) {
            LOG.error(uee);
            throw new IllegalStateException(uee); // should never happen; UTF-8 is always supported
        }
        PushbackReader pushback = new PushbackReader(reader);
        return new SimpleCsvReader(pushback, jobConf.getFieldSeparator(), jobConf.getStringQuote());
    }

}
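
Usage

The snippet below is a minimal sketch, not part of the original source, showing how the task's shuffle settings might be tuned before execution. The enclosing job instance and myJobConf are hypothetical placeholders for a concrete GridJob implementation and a DBPartitioningJobConf describing the target table; the task itself is then run by the grid framework, which calls execute().

// Hypothetical setup: 'job' and 'myJobConf' stand in for a concrete
// GridJob and a DBPartitioningJobConf for the table being partitioned.
CsvPartitioningTask task = new CsvPartitioningTask(job, myJobConf);
task.setShuffleThreads(4);   // run asynchronous shuffles on 4 worker threads
task.setShuffleUnits(10000); // dispatch a shuffle job every 10,000 buffered lines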