org.apache.tajo.master.Repartitioner.java Source code

Introduction

Here is the source code for org.apache.tajo.master.Repartitioner.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.tajo.master;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.Path;
import org.apache.tajo.QueryIdFactory;
import org.apache.tajo.SubQueryId;
import org.apache.tajo.catalog.*;
import org.apache.tajo.catalog.proto.CatalogProtos.StoreType;
import org.apache.tajo.catalog.statistics.TableStat;
import org.apache.tajo.conf.TajoConf.ConfVars;
import org.apache.tajo.engine.planner.PlannerUtil;
import org.apache.tajo.engine.planner.RangePartitionAlgorithm;
import org.apache.tajo.engine.planner.UniformRangePartition;
import org.apache.tajo.engine.planner.logical.*;
import org.apache.tajo.engine.utils.TupleUtil;
import org.apache.tajo.exception.InternalException;
import org.apache.tajo.master.ExecutionBlock.PartitionType;
import org.apache.tajo.master.QueryUnit.IntermediateEntry;
import org.apache.tajo.storage.Fragment;
import org.apache.tajo.storage.TupleRange;
import org.apache.tajo.util.TUtil;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.math.BigDecimal;
import java.net.URI;
import java.util.*;
import java.util.Map.Entry;

/**
 * Repartitioner creates non-leaf tasks and shuffles intermediate data.
 * It supports two repartition methods: hash repartition and range repartition.
 */
public class Repartitioner {
    private static final Log LOG = LogFactory.getLog(Repartitioner.class);

    private static final int HTTP_REQUEST_MAXIMUM_LENGTH = 1900;

    public static QueryUnit[] createJoinTasks(SubQuery subQuery) throws IOException {
        ExecutionBlock execBlock = subQuery.getBlock();
        CatalogService catalog = subQuery.getContext().getCatalog();

        ScanNode[] scans = execBlock.getScanNodes();
        Path tablePath;
        Fragment[] fragments = new Fragment[2];
        TableStat[] stats = new TableStat[2];

        // initialize variables from the child operators
        for (int i = 0; i < 2; i++) {
            // TODO - temporary tables should be stored in a temporary catalog for each query
            if (scans[i].getTableId().startsWith(SubQueryId.PREFIX)) {
                tablePath = subQuery.getStorageManager().getTablePath(scans[i].getTableId());
            } else {
                tablePath = catalog.getTableDesc(scans[i].getTableId()).getPath();
            }

            if (scans[i].isLocal()) { // it only requires a dummy fragment.
                fragments[i] = new Fragment(scans[i].getTableId(), tablePath,
                        CatalogUtil.newTableMeta(scans[i].getInSchema(), StoreType.CSV), 0, 0, null);
            } else {
                fragments[i] = subQuery
                        .getStorageManager().getSplits(scans[i].getTableId(),
                                catalog.getTableDesc(scans[i].getTableId()).getMeta(), new Path(tablePath, "data"))
                        .get(0);
            }

            // Getting a table stat for each scan
            stats[i] = subQuery.getChildQuery(scans[i]).getTableStat();
        }

        // Assigning either fragments or fetch urls to query units
        QueryUnit[] tasks;
        if (scans[0].isBroadcast() || scans[1].isBroadcast()) {
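            // a broadcast join reads the base fragments directly, so a single task
            // holding both fragments is sufficient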
            tasks = new QueryUnit[1];
            tasks[0] = new QueryUnit(QueryIdFactory.newQueryUnitId(subQuery.getId(), 0), false,
                    subQuery.getEventHandler());
            tasks[0].setLogicalPlan(execBlock.getPlan());
            tasks[0].setFragment(scans[0].getTableId(), fragments[0]);
            tasks[0].setFragment(scans[1].getTableId(), fragments[1]);
        } else {
            // The hash map is modeled as follows:
            // <Partition Id, <Table Name, Intermediate Data>>
            Map<Integer, Map<String, List<IntermediateEntry>>> hashEntries = new HashMap<Integer, Map<String, List<IntermediateEntry>>>();
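            // Illustrative shape (hypothetical ids and names): partition 3, produced
            // by both "t1" and "t2", is kept together as
            //   {3 -> {"t1" -> [entries...], "t2" -> [entries...]}}
            // so that one join task can fetch both sides of the same partition.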

            // Grouping IntermediateData by a partition key and a table name
            for (ScanNode scan : scans) {
                SubQuery childSubQuery = subQuery.getChildQuery(scan);
                for (QueryUnit task : childSubQuery.getQueryUnits()) {
                    if (task.getIntermediateData() != null) {
                        for (IntermediateEntry intermEntry : task.getIntermediateData()) {
                            if (hashEntries.containsKey(intermEntry.getPartitionId())) {
                                Map<String, List<IntermediateEntry>> tbNameToInterm = hashEntries
                                        .get(intermEntry.getPartitionId());

                                if (tbNameToInterm.containsKey(scan.getTableId())) {
                                    tbNameToInterm.get(scan.getTableId()).add(intermEntry);
                                } else {
                                    tbNameToInterm.put(scan.getTableId(), TUtil.newList(intermEntry));
                                }
                            } else {
                                Map<String, List<IntermediateEntry>> tbNameToInterm = new HashMap<String, List<IntermediateEntry>>();
                                tbNameToInterm.put(scan.getTableId(), TUtil.newList(intermEntry));
                                hashEntries.put(intermEntry.getPartitionId(), tbNameToInterm);
                            }
                        }
                    }
                }
            }

            LOG.info("Outer Intermediate Volume: " + stats[0].getNumBytes());
            LOG.info("Inner Intermediate Volume: " + stats[1].getNumBytes());

            // Get the desired number of join tasks according to the volume
            // of the larger table
            int largerIdx = stats[0].getNumBytes() >= stats[1].getNumBytes() ? 0 : 1;
            int desiredJoinTaskVolume = subQuery.getContext().getConf().getIntVar(ConfVars.JOIN_TASK_VOLUME);

            // calculate the number of tasks according to the data size
            int mb = (int) Math.ceil((double) stats[largerIdx].getNumBytes() / 1048576);
            LOG.info("Larger intermediate data is approximately " + mb + " MB");
            // determine the number of tasks, one per the configured task volume (in MB)
            int maxTaskNum = (int) Math.ceil((double) mb / desiredJoinTaskVolume);
            LOG.info("The calculated number of tasks is " + maxTaskNum);
            LOG.info("The number of total partition keys is " + hashEntries.size());
            // the number of join tasks cannot be larger than the number of
            // distinct partition ids.
            int joinTaskNum = Math.min(maxTaskNum, hashEntries.size());
            LOG.info("The determined number of join tasks is " + joinTaskNum);
            QueryUnit[] createdTasks = newEmptyJoinTask(subQuery, fragments, joinTaskNum);

            // Assign partitions to tasks in a round robin manner.
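            // e.g. (illustrative), four partition ids {p0, p1, p2, p3} spread over
            // two tasks are assigned as task0: {p0, p2} and task1: {p1, p3}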
            int i = 0;
            for (Entry<Integer, Map<String, List<IntermediateEntry>>> entry : hashEntries.entrySet()) {
                addJoinPartition(createdTasks[i++], subQuery, entry.getKey(), entry.getValue());
                if (i >= joinTaskNum) {
                    i = 0;
                }
            }

            List<QueryUnit> filteredTasks = new ArrayList<QueryUnit>();
            for (QueryUnit task : createdTasks) {
                // if there are at least two fetches, the join is possible.
                if (task.getFetches().size() > 1) {
                    filteredTasks.add(task);
                }
            }

            tasks = filteredTasks.toArray(new QueryUnit[filteredTasks.size()]);
        }

        return tasks;
    }

    private static QueryUnit[] newEmptyJoinTask(SubQuery subQuery, Fragment[] fragments, int taskNum) {
        ExecutionBlock execBlock = subQuery.getBlock();
        QueryUnit[] tasks = new QueryUnit[taskNum];
        for (int i = 0; i < taskNum; i++) {
            tasks[i] = new QueryUnit(QueryIdFactory.newQueryUnitId(subQuery.getId(), i), execBlock.isLeafBlock(),
                    subQuery.getEventHandler());
            tasks[i].setLogicalPlan(execBlock.getPlan());
            for (Fragment fragment : fragments) {
                tasks[i].setFragment2(fragment);
            }
        }

        return tasks;
    }

    private static void addJoinPartition(QueryUnit task, SubQuery subQuery, int partitionId,
            Map<String, List<IntermediateEntry>> groupedPartitions) {

        for (ScanNode scanNode : subQuery.getBlock().getScanNodes()) {
            Map<String, List<IntermediateEntry>> requests;
            if (groupedPartitions.containsKey(scanNode.getTableId())) {
                requests = mergeHashPartitionRequest(groupedPartitions.get(scanNode.getTableId()));
            } else {
                return;
            }
            Set<URI> fetchURIs = TUtil.newHashSet();
            for (Entry<String, List<IntermediateEntry>> requestPerNode : requests.entrySet()) {
                Collection<URI> uris = createHashFetchURL(requestPerNode.getKey(),
                        subQuery.getChildQuery(scanNode).getId(), partitionId, PartitionType.HASH,
                        requestPerNode.getValue());
                fetchURIs.addAll(uris);
            }
            task.addFetches(scanNode.getTableId(), fetchURIs);
        }
    }

    /**
     * Merges the partition requests that share the same pull server address.
     * Merging reduces the number of TCP connections a task must open.
     *
     * @return key: pull server address; value: the list of merged requests
     */
    private static Map<String, List<IntermediateEntry>> mergeHashPartitionRequest(
            List<IntermediateEntry> partitions) {
        Map<String, List<IntermediateEntry>> mergedPartitions = new HashMap<String, List<IntermediateEntry>>();
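        // hypothetical example: three entries pulled from "host1:8080" and one from
        // "host2:8080" merge into two map entries, i.e. two TCP connections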
        for (IntermediateEntry partition : partitions) {
            if (mergedPartitions.containsKey(partition.getPullAddress())) {
                mergedPartitions.get(partition.getPullAddress()).add(partition);
            } else {
                mergedPartitions.put(partition.getPullAddress(), TUtil.newList(partition));
            }
        }

        return mergedPartitions;
    }

    public static QueryUnit[] createNonLeafTask(SubQuery subQuery, SubQuery childSubQuery, int maxNum)
            throws InternalException {
        ExecutionBlock childExecBlock = childSubQuery.getBlock();
        if (childExecBlock.getPartitionType() == PartitionType.HASH) {
            return createHashPartitionedTasks(subQuery, childSubQuery, maxNum);
        } else if (childExecBlock.getPartitionType() == PartitionType.RANGE) {
            return createRangePartitionedTasks(subQuery, childSubQuery, maxNum);
        } else {
            throw new InternalException("Cannot support partition type");
        }
    }

    public static QueryUnit[] createRangePartitionedTasks(SubQuery subQuery, SubQuery childSubQuery, int maxNum)
            throws InternalException {
        ExecutionBlock execBlock = subQuery.getBlock();
        TableStat stat = childSubQuery.getTableStat();
        if (stat.getNumRows() == 0) {
            return new QueryUnit[0];
        }

        ScanNode scan = execBlock.getScanNodes()[0];
        Path tablePath;
        tablePath = subQuery.getContext().getStorageManager().getTablePath(scan.getTableId());

        StoreTableNode store = (StoreTableNode) childSubQuery.getBlock().getPlan();
        SortNode sort = (SortNode) store.getSubNode();
        SortSpec[] sortSpecs = sort.getSortKeys();
        Schema sortSchema = PlannerUtil.sortSpecsToSchema(sort.getSortKeys());

        // calculate the maximum number of query ranges
        TupleRange mergedRange = TupleUtil.columnStatToRange(sort.getOutSchema(), sortSchema,
                stat.getColumnStats());
        RangePartitionAlgorithm partitioner = new UniformRangePartition(sortSchema, mergedRange);
        BigDecimal card = partitioner.getTotalCardinality();
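        // illustrative example: a single int sort key whose merged range covers
        // [1, 1000] yields a total cardinality of 1000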

        // if the range cardinality is less than the desired number of tasks,
        // we set the number of tasks to the range cardinality.
        int determinedTaskNum;
        if (card.compareTo(new BigDecimal(maxNum)) < 0) {
            LOG.info("The range cardinality (" + card + ") is less than the desired number of tasks (" + maxNum
                    + ")");
            determinedTaskNum = card.intValue();
        } else {
            determinedTaskNum = maxNum;
        }

        LOG.info("Try to divide " + mergedRange + " into " + determinedTaskNum + " sub ranges (total units: "
                + determinedTaskNum + ")");
        TupleRange[] ranges = partitioner.partition(determinedTaskNum);

        Fragment dummyFragment = new Fragment(scan.getTableId(), tablePath,
                CatalogUtil.newTableMeta(scan.getInSchema(), StoreType.CSV), 0, 0, null);

        List<String> basicFetchURIs = new ArrayList<String>();

        SubQuery child = childSubQuery.getContext().getSubQuery(subQuery.getBlock().getChildBlock(scan).getId());
        for (QueryUnit qu : child.getQueryUnits()) {
            for (IntermediateEntry p : qu.getIntermediateData()) {
                String uri = createBasicFetchUri(p.getPullHost(), p.getPullPort(), childSubQuery.getId(), p.taskId,
                        p.attemptId);
                basicFetchURIs.add(uri);
            }
        }

        boolean ascendingFirstKey = sortSpecs[0].isAscending();
        SortedMap<TupleRange, Set<URI>> map;
        if (ascendingFirstKey) {
            map = new TreeMap<TupleRange, Set<URI>>();
        } else {
            map = new TreeMap<TupleRange, Set<URI>>(new TupleRange.DescendingTupleRangeComparator());
        }

        Set<URI> uris;
        try {
            for (int i = 0; i < ranges.length; i++) {
                uris = new HashSet<URI>();
                for (String uri : basicFetchURIs) {
                    String rangeParam = TupleUtil.rangeToQuery(sortSchema, ranges[i], ascendingFirstKey,
                            ascendingFirstKey ? i == (ranges.length - 1) : i == 0);
                    URI finalUri = URI.create(uri + "&" + rangeParam);
                    uris.add(finalUri);
                }
                map.put(ranges[i], uris);
            }

        } catch (UnsupportedEncodingException e) {
            LOG.error(e);
        }

        QueryUnit[] tasks = createEmptyNonLeafTasks(subQuery, determinedTaskNum, dummyFragment);
        assignPartitionByRoundRobin(map, scan.getTableId(), tasks);
        return tasks;
    }

    public static QueryUnit[] assignPartitionByRoundRobin(Map<?, Set<URI>> partitions, String tableName,
            QueryUnit[] tasks) {
        int tid = 0;
        for (Entry<?, Set<URI>> entry : partitions.entrySet()) {
            for (URI uri : entry.getValue()) {
                tasks[tid].addFetch(tableName, uri);
            }

            // advance before the bounds check; checking only after use would let tid
            // reach tasks.length and index out of bounds on the next iteration
            tid++;
            if (tid == tasks.length) {
                tid = 0;
            }
        }

        return tasks;
    }

    public static String createBasicFetchUri(String hostName, int port, SubQueryId childSid, int taskId,
            int attemptId) {
        String scheme = "http://";
        StringBuilder sb = new StringBuilder(scheme);
        sb.append(hostName).append(":").append(port).append("/?").append("sid=").append(childSid.getId())
                .append("&").append("ta=").append(taskId).append("_").append(attemptId).append("&").append("p=0")
                .append("&").append("type=r");

        return sb.toString();
    }

    public static QueryUnit[] createHashPartitionedTasks(SubQuery subQuery, SubQuery childSubQuery, int maxNum) {
        ExecutionBlock execBlock = subQuery.getBlock();
        TableStat stat = childSubQuery.getTableStat();
        if (stat.getNumRows() == 0) {
            return new QueryUnit[0];
        }

        ScanNode scan = execBlock.getScanNodes()[0];
        Path tablePath;
        tablePath = subQuery.getContext().getStorageManager().getTablePath(scan.getTableId());

        List<IntermediateEntry> partitions = new ArrayList<IntermediateEntry>();
        for (QueryUnit tasks : childSubQuery.getQueryUnits()) {
            if (tasks.getIntermediateData() != null) {
                partitions.addAll(tasks.getIntermediateData());
            }
        }

        Fragment frag = new Fragment(scan.getTableId(), tablePath,
                CatalogUtil.newTableMeta(scan.getInSchema(), StoreType.CSV), 0, 0, null);

        Map<Integer, List<IntermediateEntry>> hashed = hashByKey(partitions);
        Map<String, List<IntermediateEntry>> hashedByHost;
        Map<Integer, List<URI>> finalFetchURI = new HashMap<Integer, List<URI>>();

        for (Entry<Integer, List<IntermediateEntry>> interm : hashed.entrySet()) {
            hashedByHost = hashByHost(interm.getValue());
            for (Entry<String, List<IntermediateEntry>> e : hashedByHost.entrySet()) {
                Collection<URI> uris = createHashFetchURL(e.getKey(), childSubQuery.getId(), interm.getKey(),
                        childSubQuery.getBlock().getPartitionType(), e.getValue());

                if (finalFetchURI.containsKey(interm.getKey())) {
                    finalFetchURI.get(interm.getKey()).addAll(uris);
                } else {
                    finalFetchURI.put(interm.getKey(), TUtil.newList(uris));
                }
            }
        }

        GroupbyNode groupby = (GroupbyNode) childSubQuery.getBlock().getStoreTableNode().getSubNode();
        // the number of tasks cannot exceed the number of partitions that have fetch uris.
        int determinedTaskNum = Math.min(maxNum, finalFetchURI.size());
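        // an aggregation without grouping columns (e.g., a global count) produces a
        // single output row, so one task suffices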
        if (groupby.getGroupingColumns().length == 0) {
            determinedTaskNum = 1;
        }

        QueryUnit[] tasks = createEmptyNonLeafTasks(subQuery, determinedTaskNum, frag);

        int tid = 0;
        for (Entry<Integer, List<URI>> entry : finalFetchURI.entrySet()) {
            for (URI uri : entry.getValue()) {
                tasks[tid].addFetch(scan.getTableId(), uri);
            }

            tid++;

            if (tid == tasks.length) {
                tid = 0;
            }
        }

        return tasks;
    }

    public static Collection<URI> createHashFetchURL(String hostAndPort, SubQueryId childSid, int partitionId,
            PartitionType type, List<IntermediateEntry> entries) {
        String scheme = "http://";
        StringBuilder urlPrefix = new StringBuilder(scheme);
        urlPrefix.append(hostAndPort).append("/?").append("sid=").append(childSid.getId()).append("&").append("p=")
                .append(partitionId).append("&").append("type=");
        if (type == PartitionType.HASH) {
            urlPrefix.append("h");
        } else if (type == PartitionType.RANGE) {
            urlPrefix.append("r");
        }
        urlPrefix.append("&ta=");

        // If a GET request URI is longer than about 2000 characters, the server may
        // respond with HTTP Status Code 414 (Request-URI Too Long).
        // Refer to http://www.w3.org/Protocols/rfc2616/rfc2616-sec10.html#sec10.4.15
        // The code below splits such a long request into multiple shorter requests.
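        // hypothetical example: with the prefix http://host:8080/?sid=2&p=3&type=h&ta=
        // a long id list is emitted as several URLs, each carrying a comma-separated
        // run of ids such as ta=1_0,2_0 that stays within the length limit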
        List<String> taskIdsParams = new ArrayList<String>();
        boolean first = true;
        StringBuilder taskIdListBuilder = new StringBuilder();
        for (IntermediateEntry entry : entries) {
            StringBuilder taskAttemptId = new StringBuilder();

            if (!first) { // prepend a comma before every entry except the first
                taskAttemptId.append(",");
            } else {
                first = false;
            }

            taskAttemptId.append(entry.getTaskId()).append("_").append(entry.getAttemptId());
            if (taskIdListBuilder.length() + taskAttemptId.length() > HTTP_REQUEST_MAXIMUM_LENGTH) {
                taskIdsParams.add(taskIdListBuilder.toString());
                taskIdListBuilder = new StringBuilder(entry.getTaskId() + "_" + entry.getAttemptId());
            } else {
                taskIdListBuilder.append(taskAttemptId);
            }
        }

        // flush any remaining task ids as a final parameter
        if (taskIdListBuilder.length() > 0) {
            taskIdsParams.add(taskIdListBuilder.toString());
        }

        Collection<URI> fetchURLs = new ArrayList<URI>();
        for (String param : taskIdsParams) {
            fetchURLs.add(URI.create(urlPrefix + param));
        }

        return fetchURLs;
    }

    public static Map<Integer, List<IntermediateEntry>> hashByKey(List<IntermediateEntry> entries) {
        Map<Integer, List<IntermediateEntry>> hashed = new HashMap<Integer, List<IntermediateEntry>>();
        for (IntermediateEntry entry : entries) {
            if (hashed.containsKey(entry.getPartitionId())) {
                hashed.get(entry.getPartitionId()).add(entry);
            } else {
                hashed.put(entry.getPartitionId(), TUtil.newList(entry));
            }
        }

        return hashed;
    }

    public static QueryUnit[] createEmptyNonLeafTasks(SubQuery subQuery, int num, Fragment frag) {
        LogicalNode plan = subQuery.getBlock().getPlan();
        QueryUnit[] tasks = new QueryUnit[num];
        for (int i = 0; i < num; i++) {
            tasks[i] = new QueryUnit(QueryIdFactory.newQueryUnitId(subQuery.getId(), i), false,
                    subQuery.getEventHandler());
            tasks[i].setFragment2(frag);
            tasks[i].setLogicalPlan(plan);
        }
        return tasks;
    }

    public static Map<String, List<IntermediateEntry>> hashByHost(List<IntermediateEntry> entries) {
        Map<String, List<IntermediateEntry>> hashed = new HashMap<String, List<IntermediateEntry>>();

        String hostName;
        for (IntermediateEntry entry : entries) {
            hostName = entry.getPullHost() + ":" + entry.getPullPort();
            if (hashed.containsKey(hostName)) {
                hashed.get(hostName).add(entry);
            } else {
                hashed.put(hostName, TUtil.newList(entry));
            }
        }

        return hashed;
    }

    public static SubQuery setPartitionNumberForTwoPhase(SubQuery subQuery, final int n) {
        ExecutionBlock execBlock = subQuery.getBlock();
        Column[] keys = null;
        // if the parent block is a join,
        // set the partition number for the current execution block
        // TODO: union handling is required when a join has unions as its children
        ExecutionBlock parentBlock = execBlock.getParentBlock();
        if (parentBlock != null) {
            if (parentBlock.getStoreTableNode().getSubNode().getType() == ExprType.JOIN) {
                execBlock.getStoreTableNode().setPartitions(execBlock.getPartitionType(),
                        execBlock.getStoreTableNode().getPartitionKeys(), n);
                keys = execBlock.getStoreTableNode().getPartitionKeys();
            }
        }

        StoreTableNode store = execBlock.getStoreTableNode();
        // set the partition number for group by and sort
        if (execBlock.getPartitionType() == PartitionType.HASH) {
            if (store.getSubNode().getType() == ExprType.GROUP_BY) {
                GroupbyNode groupby = (GroupbyNode) store.getSubNode();
                keys = groupby.getGroupingColumns();
            }
        } else if (execBlock.getPartitionType() == PartitionType.RANGE) {
            if (store.getSubNode().getType() == ExprType.SORT) {
                SortNode sort = (SortNode) store.getSubNode();
                keys = new Column[sort.getSortKeys().length];
                for (int i = 0; i < keys.length; i++) {
                    keys[i] = sort.getSortKeys()[i].getSortKey();
                }
            }
        }
        if (keys != null) {
            if (keys.length == 0) {
                store.setPartitions(execBlock.getPartitionType(), new Column[] {}, 1);
            } else {
                store.setPartitions(execBlock.getPartitionType(), keys, n);
            }
        } else {
            store.setListPartition();
        }
        return subQuery;
    }
}
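
The grouping pattern used by hashByKey and hashByHost above is a plain bucket-by-key fold over a list. The following minimal, self-contained sketch reproduces that pattern outside of Tajo; the Entry class and the sample data are illustrative stand-ins for QueryUnit.IntermediateEntry, not part of the original source.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class GroupingSketch {
    // illustrative stand-in for QueryUnit.IntermediateEntry
    static class Entry {
        final int partitionId;
        final String pullAddress;

        Entry(int partitionId, String pullAddress) {
            this.partitionId = partitionId;
            this.pullAddress = pullAddress;
        }
    }

    // same shape as Repartitioner.hashByKey(): bucket entries by partition id
    static Map<Integer, List<Entry>> hashByKey(List<Entry> entries) {
        Map<Integer, List<Entry>> hashed = new HashMap<Integer, List<Entry>>();
        for (Entry e : entries) {
            List<Entry> bucket = hashed.get(e.partitionId);
            if (bucket == null) {
                bucket = new ArrayList<Entry>();
                hashed.put(e.partitionId, bucket);
            }
            bucket.add(e);
        }
        return hashed;
    }

    public static void main(String[] args) {
        List<Entry> entries = new ArrayList<Entry>();
        entries.add(new Entry(0, "worker1:8080"));
        entries.add(new Entry(1, "worker2:8080"));
        entries.add(new Entry(0, "worker2:8080"));
        // prints "0 -> 2 entries" and "1 -> 1 entries" (map iteration order may vary)
        for (Map.Entry<Integer, List<Entry>> e : hashByKey(entries).entrySet()) {
            System.out.println(e.getKey() + " -> " + e.getValue().size() + " entries");
        }
    }
}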