org.hxx.hadoop.URLCountPartitioner.java Source code


Introduction

Here is the source code for org.hxx.hadoop.URLCountPartitioner.java, a Hadoop Partitioner that assigns URLs to reduce partitions by host while keeping per-partition URL counts roughly balanced.

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.hxx.hadoop;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.net.URLNormalizers;

/**
 * Partitions URLs by host while counting how many URLs each reduce partition receives.
 * Once a partition reaches the configured limit (Generator.GENERATOR_MAX_COUNT, or
 * numReduceTasks when unset), URLs from hosts not yet counted in that partition are
 * diverted to the least loaded partition, keeping per-partition URL counts roughly balanced.
 */
public class URLCountPartitioner implements Partitioner<Text, Writable> {
    private int seed;
    private URLNormalizers normalizers;

    /** partition id -> per-host URL counts for that partition (plus a "total" entry). */
    private Map partHostNums = new HashMap();
    /** host -> partition id, for hosts moved elsewhere once their hash partition filled up. */
    private Map hostParts2 = new HashMap();
    /** host -> partition id, precomputed by initPart() from the generate.cnt string. */
    private Map hostParts = new HashMap();
    private long topn = 100000;// read in configure() but not otherwise used
    private int hostn = -1;// per-partition URL limit (GENERATOR_MAX_COUNT); -1 = unset
    private String cntStr;// "partTotal:N,host:count,..." string; only used by the disabled initPart() path
    private boolean initPart = false;

    public void close() {
    }

    public void configure(JobConf job) {
        seed = job.getInt("partition.url.seed", 0);
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_PARTITION);

        topn = job.getLong(Generator.GENERATOR_TOP_N, 100000);
        hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1);
        // The count-table path below is currently disabled, so getPartition()
        // always falls back to getHostPartition().
        // cntStr = job.get(GeneratorHbase.GENERATL_CNT);
        // int reduceNum = job.getInt(GeneratorHbase.GENERATL_REDUCENUM, 1);
        // initPart(reduceNum);
    }

    /** Partition by host, balancing per-partition URL counts (see getHostPartition). */
    public int getPartition(Text key, Writable value, int numReduceTasks) {
        // if (!initPart) {
        // initPart(numReduceTasks);
        // GeneratorHbase.LOG.info("*************************************");
        // }

        // if (cntStr == null || cntStr.length() == 0)
        return getHostPartition(key, numReduceTasks);

        // return getPartition(key, numReduceTasks);
    }

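    // Local test harness: pushes a sample "generate.cnt" string through configure();
    // note that the initPart() call inside configure() is currently commented out.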
    public static void main(String[] args) {
        URLCountPartitioner urlCountPartitioner = new URLCountPartitioner();
        JobConf job = new JobConf();

        // job.set("generate.cnt",
        // "partTotal:20000,10.200.4.136:2319,10.200.6.207:20001,10.200.4.141:9408,10.200.5.85:18004,10.200.7.13:10266,10.200.7.225:20002");
        job.set("generate.cnt",
                "partTotal:20000,10.200.4.136:953,10.200.6.207:20001,10.200.4.141:8578,10.200.7.22:20001,10.200.7.13:20001,10.200.7.225:10466");
        job.setInt("generate.reduceNum", 4);

        urlCountPartitioner.configure(job);
    }

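    /**
     * Builds the host -> partition table from a "partTotal:N,host:count,..." string:
     * hosts are sorted by count (largest first), each partition is seeded with the next
     * largest host, and approachLimit() then packs smaller hosts in until the partition
     * total approaches partTotal.
     */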
    private void initPart(int reduceNum) {
        if (cntStr != null && cntStr.length() != 0) {
            initPart = true;

            String[] tmp = cntStr.split(",");
            long partLimit = 0;
            List<String[]> hostCntList = new ArrayList<String[]>();
            Map hostNumValues = new HashMap();
            for (int i = 0; i < tmp.length; i++) {
                String[] hostCnt = tmp[i].split(":");
                if ("partTotal".equals(hostCnt[0]))
                    partLimit = Long.parseLong(hostCnt[1]);
                else {
                    hostCntList.add(hostCnt);
                    hostNumValues.put(hostCnt[0], Long.parseLong(hostCnt[1]));
                }
            }

            // Sort hosts by URL count, largest first.
            Collections.sort(hostCntList, new Comparator<String[]>() {
                public int compare(String[] o1, String[] o2) {
                    return Long.valueOf(o2[1]).compareTo(Long.valueOf(o1[1]));
                }
            });

            // List<String[]> bigHosts = hostCntList;
            // if (hostCntList.size() > reduceNum) {
            // bigHosts = hostCntList.subList(0, reduceNum);
            // hostCntList = hostCntList.subList(reduceNum, hostCntList.size());
            // }
            // Seed each partition with the next largest host, then top it up via approachLimit().
            for (int i = 0; i < reduceNum; i++) {
                if (hostCntList.size() > i) {
                    String[] current = hostCntList.get(i);
                    long curNum = Long.parseLong(current[1]);
                    String host = current[0];
                    Integer part = Integer.valueOf(i);
                    hostParts.put(host, part);

                    approachLimit(partLimit, curNum, hostCntList, part);
                }
            }

            testPrint(hostNumValues);
        }
    }

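    // Debug logging: per-host counts, the host -> partition assignment, and the
    // resulting total URL count per partition.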
    private void testPrint(Map hostNumValue) {
        Map partHosts = new HashMap();
        for (Iterator iterator = hostParts.keySet().iterator(); iterator.hasNext();) {
            String host = (String) iterator.next();
            Integer part = (Integer) hostParts.get(host);

            if (partHosts.containsKey(part)) {
                partHosts.put(part, ((Long) partHosts.get(part)) + ((Long) hostNumValue.get(host)));
            } else {
                partHosts.put(part, (Long) hostNumValue.get(host));
            }
        }

        GeneratorMapHbase.LOG.info(hostNumValue.toString());
        GeneratorMapHbase.LOG.info(hostParts.toString());
        GeneratorMapHbase.LOG.info(partHosts.toString());
    }

    /**
     * Greedily assigns additional hosts from {@code data} to partition {@code part}
     * until the running total reaches {@code partLimit}; returns immediately if
     * {@code total} already meets the limit.
     */
    private void approachLimit(long partLimit, long total, List<String[]> data, Integer part) {
        if (data == null || data.isEmpty())
            return;

        long distance = partLimit - total;
        if (distance <= 0)
            return;

        // Find a host whose count fits in the remaining capacity.
        for (int i = 0; i < data.size(); i++) {
            String[] current = data.get(i);
            long curNum = Long.parseLong(current[1]);
            String host = current[0];

            if (curNum == distance) {// fills the partition exactly
                hostParts.put(host, part);
                data.remove(i);
                return;
            } else if (curNum < distance) {// room left; keep filling with further hosts
                hostParts.put(host, part);
                data.remove(i);
                approachLimit(partLimit, total + curNum, data, part);
                return;
            }
        }

    }

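    // Looks up the partition precomputed by initPart() for the URL's host; falls back
    // to hash-based partitioning when the host is not in the table.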
    private int getPartition(Text key, int numReduceTasks) {
        String urlString = key.toString();
        String host = null;
        try {
            urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
            URL url = new URL(urlString);
            host = url.getHost();
        } catch (MalformedURLException e) {
            GeneratorMapHbase.LOG.warn("Malformed URL: '" + urlString + "'");
        }

        if (host == null)
            return getHostPartition(key, numReduceTasks);

        Integer part = (Integer) hostParts.get(host);
        if (part == null)// host not in the precomputed table
            return getHostPartition(key, numReduceTasks);
        return part.intValue();
    }

    /** Plain hash partitioning by host name, without count balancing. */
    private int getHashPartition(Text key, Writable value, int numReduceTasks) {
        String urlString = key.toString();
        URL url = null;
        int hashCode = urlString.hashCode();
        try {
            urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
            url = new URL(urlString);
            hashCode = url.getHost().hashCode();
        } catch (MalformedURLException e) {
            GeneratorMapHbase.LOG.warn("Malformed URL: '" + urlString + "'");
        }
        // make hosts wind up in different partitions on different runs
        hashCode ^= seed;

        return (hashCode & Integer.MAX_VALUE) % numReduceTasks;
    }

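    // Hash the URL's host to pick an initial partition, then let getPart() cap how many
    // URLs each partition receives (limit = GENERATOR_MAX_COUNT if set, else numReduceTasks).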
    private int getHostPartition(Text key, int numReduceTasks) {
        String urlString = key.toString();
        String host = null;
        int hashCode = urlString.hashCode();
        try {
            urlString = normalizers.normalize(urlString, URLNormalizers.SCOPE_PARTITION);
            URL url = new URL(urlString);
            host = url.getHost();
            hashCode = host.hashCode();
        } catch (MalformedURLException e) {
            GeneratorMapHbase.LOG.warn("Malformed URL: '" + urlString + "'");
        }

        // make hosts wind up in different partitions on different runs
        hashCode ^= seed;
        int oriPart = (hashCode & Integer.MAX_VALUE) % numReduceTasks;

        // Per-partition URL limit: GENERATOR_MAX_COUNT when configured, else numReduceTasks.
        long partLimit = numReduceTasks;
        if (hostn != -1) {
            partLimit = hostn;
        }

        return getPart(host, oriPart, partLimit, numReduceTasks);
    }

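    /**
     * Counting core: per-partition URL totals live in partHostNums. A URL stays in its
     * hash partition until that partition's total reaches partLimit; after that, URLs
     * from hosts not already counted there are diverted via getHostPart().
     */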
    private int getPart(String host, int oriPart, long partLimit, int numReduceTasks) {
        Integer partKey = Integer.valueOf(oriPart);
        if (partHostNums.containsKey(partKey)) {// partition already has counts
            Map hostNums = (Map) partHostNums.get(partKey);
            AtomicLong total = (AtomicLong) hostNums.get("total");
            if (total.get() >= partLimit) {// partition is full
                if (hostNums.containsKey(host)) {// known host stays put
                    addHost(host, hostNums);
                    return oriPart;
                } else {// unseen host: divert it to another partition and count it there
                    int part = getHostPart(numReduceTasks, host);
                    Integer newKey = Integer.valueOf(part);
                    if (partHostNums.containsKey(newKey)) {
                        addHost(host, (Map) partHostNums.get(newKey));
                    } else {
                        addNewPart(host, newKey);
                    }
                    return part;
                }
            } else {// still below the limit
                addHost(host, hostNums);
                return oriPart;
            }
        } else {// first URL seen for this partition
            addNewPart(host, partKey);
        }

        return oriPart;
    }

    // Increment the partition's total and the per-host count (creating the host entry on first use).
    private void addHost(String host, Map hostNums) {
        ((AtomicLong) hostNums.get("total")).incrementAndGet();
        if (hostNums.containsKey(host)) {
            ((AtomicLong) hostNums.get(host)).incrementAndGet();
        } else {
            AtomicLong num = new AtomicLong(1);
            hostNums.put(host, num);
        }
    }

    // Start a fresh count map for a partition, seeded with this host.
    private void addNewPart(String host, Integer part) {
        Map hostNums = new HashMap();
        AtomicLong num = new AtomicLong(1);
        hostNums.put(host, num);
        hostNums.put("total", new AtomicLong(1));
        partHostNums.put(part, hostNums);
    }

    // Choose a partition for an overflow host: reuse a cached choice, else a partition
    // that already counts this host, else the least loaded (or an untouched) partition.
    private int getHostPart(int numReduceTasks, String host) {
        if (hostParts2.containsKey(host)) {
            return (Integer) hostParts2.get(host);
        } else {
            long min = Long.MAX_VALUE;
            int part = -1;
            for (int i = 0; i < numReduceTasks; i++) {// does any partition already count this host?
                Integer partKey = Integer.valueOf(i);
                if (partHostNums.containsKey(partKey)) {
                    if (((Map) partHostNums.get(partKey)).containsKey(host)) {
                        part = i;
                        break;
                    }
                }
            }
            if (part == -1) {// host counted nowhere yet: pick the partition with the smallest total
                for (int i = 0; i < numReduceTasks; i++) {
                    Integer partKey = Integer.valueOf(i);
                    if (partHostNums.containsKey(partKey)) {
                        Map hostNum = (Map) partHostNums.get(partKey);
                        AtomicLong total = (AtomicLong) hostNum.get("total");
                        if (total.get() < min) {
                            min = total.get();
                            part = i;
                        }
                    } else {
                        // An untouched partition has no URLs yet; use it right away.
                        part = i;
                        break;
                    }
                }
            }

            hostParts2.put(host, Integer.valueOf(part));
            return part;
        }
    }
}
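
Usage

The snippet below is a minimal sketch of how the partitioner can be exercised directly, in the spirit of the main() harness above. The class name PartitionDemo, the sample URLs, and the small limit value are illustrative, and it assumes a Nutch configuration with the URL-normalizer plugins is on the classpath, since configure() builds a URLNormalizers instance. In a real job the class would instead be registered with JobConf.setPartitionerClass(URLCountPartitioner.class) and the mapper would emit Text keys.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.nutch.crawl.Generator;

import org.hxx.hadoop.URLCountPartitioner;

public class PartitionDemo {
    public static void main(String[] args) {
        JobConf job = new JobConf();
        // Keys read by URLCountPartitioner.configure().
        job.setInt("partition.url.seed", 42);
        job.setInt(Generator.GENERATOR_MAX_COUNT, 3);// small per-partition limit for the demo

        URLCountPartitioner partitioner = new URLCountPartitioner();
        partitioner.configure(job);

        // Feed a few URLs through and print the partition each one lands in.
        String[] urls = { "http://example.org/a", "http://example.org/b", "http://example.org/c",
                "http://example.org/d", "http://example.net/x", "http://example.com/y" };
        int numReduceTasks = 2;
        for (String u : urls) {
            int part = partitioner.getPartition(new Text(u), NullWritable.get(), numReduceTasks);
            System.out.println(part + "\t" + u);
        }
    }
}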