org.hxx.hadoop.GeneratorMapHbase.java Source code

Introduction

Here is the source code for org.hxx.hadoop.GeneratorMapHbase.java. The class is a map-side variant of Nutch's Generator: its GenerateMark mapper filters candidate URLs, enforces a per-host quota, writes each selected entry back to an HBase table via TableReader.generatedPut, and emits the entries so they can be partitioned into fetch lists stored as segment sequence files.
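
For orientation, here is a minimal sketch of how the generator might be driven programmatically, mirroring the call made in run() at the bottom of the file; the HDFS paths and parameter values are placeholders, not part of the original source:

    Configuration conf = NutchConfiguration.create();
    GeneratorMapHbase generator = new GeneratorMapHbase(conf);
    // dbDir, segments, numLists, topN, curTime, filter, norm, force, maxNumSegments, tableDepth
    Path[] segs = generator.generate(new Path("/data/crawldb"), new Path("/data/segments"),
            4, 80000L, System.currentTimeMillis(), false, false, false, 1, 1);
    if (segs == null) {
        System.out.println("Generator: no entries selected");
    }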

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.hxx.hadoop;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
// Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class GeneratorMapHbase extends Generator {
    public static final Logger LOG = LoggerFactory.getLogger(GeneratorMapHbase.class);
    static final String GENERATL_CNT = "generate.cnt";
    static final String GENERATL_TABLE = "generate.table";
    static final String GENERATL_REDUCENUM = "generate.reduceNum";

    /** Selects entries due for fetch. */
    public static class GenerateMark implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {
        private boolean filter;
        private URLFilters filters;
        JobConf conf;

        private HConnection connection;
        private HTableInterface table;
        private long cnt = 0;
        private Map<String, AtomicLong> hostCnt = new HashMap<String, AtomicLong>();
        private int reduceNum = 1;
        int hostn = -1;
        public static String partStr = null;

        public void configure(JobConf job) {
            conf = job;
            filter = job.getBoolean(GENERATOR_FILTER, true);
            if (filter)
                filters = new URLFilters(job);

            String tableName = job.get(GENERATL_TABLE);
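            // Open a shared HBase connection and a handle on the target table.
            // Auto-flush is disabled so puts are buffered client-side (12 MB write
            // buffer) and pushed in batches by flushCommits().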
            try {
                connection = HConnectionManager.createConnection(job);
                table = connection.getTable(tableName);
                table.setAutoFlush(false, true);
                table.setWriteBufferSize(12 * 1024 * 1024);
            } catch (IOException e) {
                // Fail fast: without the table the mapper cannot record generate marks.
                throw new RuntimeException("Could not open HBase table " + tableName, e);
            }
            cnt = 0;
            reduceNum = job.getInt(GENERATL_REDUCENUM, 1);
            hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1);
        }

        public void close() {
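            // Flush any puts still buffered on the client side and release the
            // HBase resources before the task exits.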
            try {
                table.flushCommits();
                table.close();
                connection.close();
            } catch (IOException e) {
                LOG.error("Failed to flush/close HBase table", e);
            }
            partStr = getHostStr();
            LOG.info(partStr);
        }

        public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
                throws IOException {
            if (filter) {
                // If filtering is on don't generate URLs that don't pass
                // URLFilters
                try {
                    if (filters.filter(key.toString()) == null)
                        return;
                } catch (URLFilterException e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Couldn't filter url: " + key + " (" + e.getMessage() + ")");
                    }
                }
            }

            // Enforce the per-host quota (generate.max.count): skip URLs from
            // hosts that have already reached it.
            if (!countHost(key.toString())) {
                return;
            }

            // Text.getBytes() can return a backing array longer than the current
            // contents, so copy only the valid bytes for the HBase row key.
            byte[] row = Arrays.copyOf(key.getBytes(), key.getLength());
            Put put = TableReader.generatedPut(row, value);
            table.put(put);
            if (++cnt % 10000 == 0) {
                table.flushCommits(); // push the buffered puts periodically
            }

            output.collect(key, value); // emit the entry for partitioning into fetch lists

            reporter.incrCounter("Generator", "records", 1);
        }

        /**
         * Counts URLs per host. When generate.max.count (hostn) is set, returns
         * false once a host has exceeded its quota so further URLs are skipped.
         */
        private boolean countHost(String url) {
            String host = getHost(url);
            if (host != null) {
                if (hostCnt.containsKey(host)) {
                    AtomicLong count = hostCnt.get(host);
                    count.incrementAndGet();

                    if (hostn != -1) {
                        if (count.get() <= hostn)
                            return true;
                        else {
                            count.decrementAndGet();
                            return false;
                        }
                    }
                    return true;
                } else {
                    hostCnt.put(host, new AtomicLong(1));
                    return true;
                }
            }

            return false;
        }

        private String getHost(String url) {
            String host = null;
            try {
                URL tmp = new URL(url);
                host = tmp.getHost();
            } catch (MalformedURLException e) {
                LOG.warn("Malformed url: " + url, e);
            }
            return host;
        }

        private String getHostStr() {
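            // Builds a summary of the form "partTotal:<cnt/reduceNum>,<host>:<count>,..."
            // which close() writes to the log.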
            StringBuilder sb = new StringBuilder("partTotal:");
            sb.append(cnt / reduceNum).append(",");
            for (Iterator<String> iterator = hostCnt.keySet().iterator(); iterator.hasNext();) {
                String host = iterator.next();
                sb.append(host).append(":").append(hostCnt.get(host)).append(",");
            }
            sb.deleteCharAt(sb.length() - 1);
            return sb.toString();
        }
    }

    public GeneratorMapHbase() {
    }

    public GeneratorMapHbase(Configuration conf) {
        setConf(conf);
    }

    /**
     * Generate fetchlists in one or more segments. Whether to filter URLs or
     * not is read from the crawl.generate.filter property in the configuration
     * files. If the property is not found, the URLs are filtered. Same for the
     * normalisation.
     * 
     * @param dbDir
     *            Crawl database directory
     * @param segments
     *            Segments directory
     * @param numLists
     *            Number of reduce tasks (fetch lists per segment)
     * @param topN
     *            Number of top URLs to be selected
     * @param curTime
     *            Current time in milliseconds
     * @param filter
     *            Whether to apply URLFilters to the selected URLs
     * @param norm
     *            Whether to normalise the selected URLs
     * @param force
     *            Ignored by this implementation
     * @param maxNumSegments
     *            Maximum number of segments to generate (-1 means one)
     * @param tableDepth
     *            Number of per-depth HBase tables to try
     * 
     * @return Paths to the generated segments, or null if no entries were selected
     * 
     * @throws IOException
     *             When an I/O error occurs
     */
    public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
            boolean norm, boolean force, int maxNumSegments, int tableDepth) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("Generator: starting at " + sdf.format(start));
        LOG.info("Generator: Selecting best-scoring urls due for fetch.");
        LOG.info("Generator: filtering: " + filter);
        LOG.info("Generator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("Generator: topN: " + topN);
        }
        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        if (maxNumSegments == -1)
            maxNumSegments = 1;
        List<Path> generatedSegments = new ArrayList<Path>();
        int j = 0; // suffix appended to the crawldb name to form the table name
        String table = null;
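        // Each segment is filled from per-depth tables named <crawldb name>1,
        // <crawldb name>2, ..., reading deeper tables (up to tableDepth) until
        // records are produced.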
        for (int i = 0; i < maxNumSegments; i++) {
            Path segment = null;

            long cnt = 0;
            while (cnt == 0) { // keep reading deeper tables until this segment gets records
                if (j++ == tableDepth) { // no more tables; return whatever has been generated
                    if (generatedSegments.size() > 0)
                        return generatedSegments.toArray(new Path[generatedSegments.size()]);
                    else
                        return null;
                }
                segment = new Path(segments, Generator.generateSegmentName());
                long begin = System.currentTimeMillis();
                table = dbDir.getName() + j;
                RunningJob r = generateJob(table, segment, numLists, topN - cnt, curTime, filter, norm, force);
                Counter counter = r.getCounters().findCounter("Generator", "records");
                cnt += counter.getValue();
                LOG.info("Generator: records: " + cnt + " current table=" + table + " timeused="
                        + (System.currentTimeMillis() - begin));
            }
            generatedSegments.add(segment);
            j--;
        }

        long end = System.currentTimeMillis();
        LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

        if (generatedSegments.size() > 0)
            return generatedSegments.toArray(new Path[generatedSegments.size()]);
        else
            return null;
    }

    /**
     * Configures and runs a single map-side generate job that reads from the
     * given HBase table and writes its fetch list under the segment directory.
     */
    private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
            boolean filter, boolean norm, boolean force) throws IOException {
        LOG.info("Generator: segment: " + segment);

        JobConf job = new NutchJob(getConf());
        job.setJarByClass(GeneratorMapHbase.class);
        job.setJobName("generate: from " + table + " "
                + (new SimpleDateFormat("yyyyMMdd HH:mm:ss")).format(System.currentTimeMillis()));
        // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

        if (numLists == -1) {
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        numLists = 4; // TODO: hard-coded override of the number of fetch lists
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        // job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.set(GENERATL_TABLE, table);
        job.setInt(GENERATL_REDUCENUM, numLists);

        job.setInputFormat(TableTopInputFormat.class); // companion input format, defined outside this file
        job.setMapperClass(GenerateMark.class);

        job.setPartitionerClass(URLCountPartitioner.class);
        job.setNumReduceTasks(numLists);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        job.setOutputKeyComparatorClass(HashComparator.class);
        Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
        FileOutputFormat.setOutputPath(job, output);

        return JobClient.runJob(job);
    }

    /**
     * Generate a fetchlist from the crawldb.
     */
    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new GeneratorMapHbase(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
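        // Only -topN is parsed from the command line; the crawldb and segments
        // paths below are hard-coded.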
        long topN = 80000;
        for (int i = 0; i < args.length; i++) {
            if ("-topN".equals(args[i])) {
                topN = Long.parseLong(args[i + 1]);
                i++;
            }
        }

        try {
            Path[] segs = generate(new Path("/data/crawldb"), new Path("/data/segments"), 4, topN, 0, false, false,
                    false, 1, 1);
            if (segs == null)
                return -1;
        } catch (Exception e) {
            LOG.error("Generator: " + StringUtils.stringifyException(e));
            return -1;
        }
        return 0;
    }
}