Java tutorial: GeneratorRedHbase, a Nutch fetchlist generator backed by HBase
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.hxx.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Random;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

// Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.hxx.hadoop.stat.GenerateInfo;
import org.hxx.hadoop.stat.GenerateInfos;
import org.hxx.hbase.HostFilter;
/**
 * Generates a subset of a crawl db to fetch. This version generates
 * fetchlists for several segments in one go. Unlike the initial version
 * (OldGenerator), IP resolution is done ONLY on the entries that have been
 * selected for fetching. The URLs are partitioned by IP, domain or host
 * within a segment. How the URLs are counted when limiting entries (by
 * domain or by host) can be chosen separately.
 */
public class GeneratorRedHbase extends Generator {
  public static final Logger LOG = LoggerFactory.getLogger(GeneratorRedHbase.class);

  static final String GENERATE_TABLE = "generate.table";
  static final String GENERATE_REDUCE_NUM = "generate.reduceNum";

  /**
   * An InputFormat that carries no real input data: it emits the integers
   * 0..reduceNum-1 so that each reduce task receives exactly one partition id.
   */
  public static class CodeInputFormat implements InputFormat<IntWritable, IntWritable> {

    public static class CodeReader implements RecordReader<IntWritable, IntWritable> {
      private int current = 0;
      private int reduceNum = 1;

      public CodeReader(int reduceNum) {
        this.reduceNum = reduceNum;
      }

      public boolean next(IntWritable key, IntWritable value) throws IOException {
        if (current == reduceNum)
          return false;
        key.set(current);
        value.set(current);
        current++;
        return true;
      }

      public IntWritable createKey() {
        return new IntWritable(0);
      }

      public IntWritable createValue() {
        return new IntWritable(0);
      }

      public long getPos() throws IOException {
        return current;
      }

      public void close() throws IOException {
      }

      public float getProgress() throws IOException {
        // Cast before dividing; integer division would always report 0 here.
        return (float) current / reduceNum;
      }
    }

    /** Bridges the old (mapred) and new (mapreduce) InputSplit APIs. */
    public static class CustomInputSplit extends org.apache.hadoop.mapreduce.InputSplit
        implements InputSplit {
      private int reduceNum = 1;

      public CustomInputSplit() {
      }

      public CustomInputSplit(int reduceNum) {
        this.reduceNum = reduceNum;
      }

      public void write(DataOutput out) throws IOException {
        out.writeInt(reduceNum);
      }

      public void readFields(DataInput in) throws IOException {
        reduceNum = in.readInt();
      }

      public long getLength() throws IOException {
        return reduceNum; // nominal length: the number of reduce tasks
      }

      public String[] getLocations() throws IOException {
        return new String[] {};
      }
    }

    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      // A single split: the lone map task fans out one key per reducer.
      return new CustomInputSplit[] { new CustomInputSplit(
          job.getInt(GeneratorRedHbase.GENERATE_REDUCE_NUM, 1)) };
    }

    public RecordReader<IntWritable, IntWritable> getRecordReader(InputSplit split,
        JobConf job, Reporter reporter) throws IOException {
      return new CodeReader(job.getInt(GeneratorRedHbase.GENERATE_REDUCE_NUM, 1));
    }
  }
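  /*
   * Illustration (a minimal sketch, not used by the job itself): the reader
   * simply enumerates the partition ids, so with generate.reduceNum = 3 the
   * framework sees the keys 0, 1, 2, one per reduce task:
   *
   *   CodeInputFormat.CodeReader reader = new CodeInputFormat.CodeReader(3);
   *   IntWritable k = reader.createKey();
   *   IntWritable v = reader.createValue();
   *   while (reader.next(k, v)) {
   *     System.out.println("partition id: " + k.get()); // prints 0, 1, 2
   *   }
   */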
  /** Selects entries due for fetch. */
  public static class GenerateMark implements Reducer<IntWritable, IntWritable, Text, CrawlDatum> {
    private boolean filter;
    private URLFilters filters;
    JobConf conf;
    private Map<String, AtomicLong> hostCnt = new HashMap<String, AtomicLong>();
    private int rsCache = 5000;
    private long topn = rsCache;
    private int hostn = -1;
    private long generateTime;
    private int reduceNum = 1;
    private int seed;
    private HConnection connection;
    private HTableInterface table;
    private long cnt = 0;
    private long curTimeMillis = 0;
    private long lastGenTimeMillis = 0;
    private boolean isSmart = false;

    static long getZeroTimeSeconds() {
      Calendar c = Calendar.getInstance();
      try {
        c.setTime(new SimpleDateFormat("yyyy-MM-dd").parse("2014-01-01"));
      } catch (ParseException e) {
        e.printStackTrace();
      }
      return c.getTimeInMillis() / 1000L;
    }

    static final long ZERO_TIME_MILLIS = getZeroTimeSeconds() * 1000L;

    public void configure(JobConf job) {
      conf = job;
      filter = job.getBoolean(GENERATOR_FILTER, true);
      if (filter)
        filters = new URLFilters(job);
      reduceNum = job.getInt(GeneratorRedHbase.GENERATE_REDUCE_NUM, 1);
      topn = job.getLong(Generator.GENERATOR_TOP_N, rsCache);
      hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1);
      generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
      seed = job.getInt("partition.url.seed", 0);
      String tableName = job.get(GENERATE_TABLE);
      HBaseConfiguration.merge(conf, HBaseConfiguration.create(conf));
      try {
        connection = HConnectionManager.createConnection(job);
        table = connection.getTable(tableName);
        // Buffer writes client-side; puts are flushed explicitly below.
        table.setAutoFlush(false, true);
        table.setWriteBufferSize(12 * 1024 * 1024);
      } catch (IOException e) {
        e.printStackTrace();
      }
      isSmart = job.getBoolean("nutch.smart.is", false);
      if (isSmart) {
        // Smart-generate mode: not implemented yet.
      }
    }

    public void close() {
      try {
        table.flushCommits();
        table.close();
        connection.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
      LOG.info("total=" + cnt + "; " + hostCnt);
    }

    @Override
    public void reduce(IntWritable key, Iterator<IntWritable> values,
        OutputCollector<Text, CrawlDatum> output, Reporter reporter) throws IOException {
      LOG.info("GeneratorRedHbase: loading urls for partition=" + key.get());
      int part = key.get();
      long partTopn = topn / reduceNum; // this partition's share of topN
      ResultScanner rs = getRS();
      if (rs == null)
        return;
      try {
        for (Result r : rs) {
          if (cnt == partTopn)
            return;
          if (r == null || r.isEmpty())
            return;
          byte[] urlByte = r.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("url"));
          if (!filterUrl(Bytes.toString(urlByte), part))
            continue;
          Text urlKey = new Text(urlByte);
          CrawlDatum value = new CrawlDatum();
          createValue(value, r);
          // if (!smartFilter(value))
          //   continue;
          Put put = generatedPut(urlKey.getBytes(), value);
          table.put(put);
          if (++cnt % 10000 == 0)
            table.flushCommits();
          output.collect(urlKey, value);
          reporter.incrCounter("Generator", "records", 1);
        }
      } finally {
        rs.close(); // always release the scanner, even on early return
      }
    }

    private boolean smartFilter(CrawlDatum value) throws IOException {
      if (isSmart) {
        // Only pass URLs whose fetch-interval boundary falls inside the
        // window (lastGenTimeMillis, curTimeMillis]; skip the rest.
        int interval = value.getFetchInterval();
        if (curTimeMillis / interval <= lastGenTimeMillis / interval)
          return false;
      }
      return true;
    }
    private boolean filterUrl(String url, int part) {
      if (filter) {
        // don't generate URLs that don't pass URLFilters
        try {
          if (filters.filter(url) == null)
            return false;
        } catch (URLFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
          }
          return false;
        }
      }
      String host = getHost(url);
      if (part != getHashPartition(url, host)) // does the URL belong to this partition?
        return false;
      if (host == null)
        return false;
      AtomicLong hits = hostCnt.get(host);
      if (hits == null) {
        hostCnt.put(host, new AtomicLong(1));
        return true;
      }
      hits.incrementAndGet();
      if (hostn != -1 && hits.get() > hostn) {
        hits.decrementAndGet(); // over the per-host limit; roll the count back
        return false;
      }
      return true;
    }

    private String getHost(String url) {
      String host = null;
      try {
        host = new URL(url).getHost();
      } catch (MalformedURLException e) {
        e.printStackTrace();
      }
      return host;
    }

    /** Hash by host name (falls back to the full URL when the host is unknown). */
    private int getHashPartition(String urlString, String host) {
      int hashCode = urlString.hashCode();
      if (host != null)
        hashCode = host.hashCode();
      // make hosts wind up in different partitions on different runs
      hashCode ^= seed;
      return (hashCode & Integer.MAX_VALUE) % reduceNum;
    }
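    /*
     * Partitioning example (a sketch, not taken from a real run): with
     * reduceNum = 4, every URL from "example.com" hashes on the host string,
     * so all of them land in the same partition for a given seed:
     *
     *   int hash = "example.com".hashCode() ^ seed;
     *   int part = (hash & Integer.MAX_VALUE) % 4; // same for every URL on the host
     */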
    private ResultScanner getRS() throws IOException {
      ResultScanner rs = null;
      Scan scan = new Scan();
      scan.setFilter(getFilters());
      scan.setCaching(Long.valueOf(topn).intValue());
      try {
        rs = table.getScanner(scan);
      } catch (IOException e) {
        e.printStackTrace();
      }
      return rs;
    }

    private FilterList getFilters() throws IOException {
      int intervalThreshold = conf.getInt(Generator.GENERATOR_MIN_INTERVAL, -1);
      List<Filter> tmp = new ArrayList<Filter>();

      // check the fetch schedule: the entry must be due for fetch
      SingleColumnValueFilter columnFilter = new SingleColumnValueFilter(
          Bytes.toBytes("cf1"), Bytes.toBytes("Fetchtime"),
          CompareOp.LESS_OR_EQUAL, Bytes.toBytes(generateTime));
      columnFilter.setFilterIfMissing(true);
      tmp.add(columnFilter);

      // skip entries that were already generated within the configured delay
      columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"),
          Bytes.toBytes(Nutch.GENERATE_TIME_KEY), CompareOp.LESS_OR_EQUAL,
          Bytes.toBytes(generateTime
              - conf.getLong(Generator.GENERATOR_DELAY, 24 * 3600 * 1000L)));
      tmp.add(columnFilter);

      // consider only entries with a retry (or fetch) interval lower than threshold
      if (intervalThreshold > 0) {
        columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"),
            Bytes.toBytes("FetchInterval"), CompareOp.LESS_OR_EQUAL,
            Bytes.toBytes(intervalThreshold));
        tmp.add(columnFilter);
      }

      // limit the number of rows per host
      if (hostn > 0) {
        Filter filter = new HostFilter(hostn);
        tmp.add(filter);
      }

      // limit the total number of rows returned by the scan
      Filter filter = new PageFilter(topn);
      tmp.add(filter);

      return new FilterList(tmp);
    }

    private void createValue(CrawlDatum datum, Result r) {
      NavigableMap<byte[], byte[]> map = r.getFamilyMap(Bytes.toBytes("cf1"));
      MapWritable metaData = new MapWritable();
      for (Entry<byte[], byte[]> e : map.entrySet()) {
        byte[] key = e.getKey();
        byte[] value = e.getValue();
        String skey = Bytes.toString(key);
        if ("url".equals(skey)) {
          // the URL becomes the output key, not part of the datum
        } else if ("Score".equals(skey)) {
          if (value != null)
            datum.setScore(Bytes.toFloat(value));
        } else if ("Status".equals(skey)) {
          if (value != null)
            datum.setStatus(value[0]);
        } else if ("Fetchtime".equals(skey)) {
          if (value != null)
            datum.setFetchTime(Bytes.toLong(value));
        } else if ("Retries".equals(skey)) {
          if (value != null)
            datum.setRetriesSinceFetch(value[0]);
        } else if ("FetchInterval".equals(skey)) {
          if (value != null)
            datum.setFetchInterval(Bytes.toInt(value));
        } else if ("Modifiedtime".equals(skey)) {
          if (value != null)
            datum.setModifiedTime(Bytes.toLong(value));
        } else if ("Signature".equals(skey)) {
          if (value != null)
            datum.setSignature(value);
        } else {
          metaData.put(new Text(key), new Text(value));
        }
      }
      metaData.put(new Text("urlid"), new Text(r.getRow()));
      datum.setMetaData(metaData);
    }

    private Put generatedPut(byte[] url, CrawlDatum value) {
      Put put = createPut(url, value);
      // record the generate time so the row is not selected again too soon
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes(Nutch.GENERATE_TIME_KEY),
          Bytes.toBytes(generateTime));
      return put;
    }

    private Put createPut(byte[] url, CrawlDatum value) {
      MapWritable meta = value.getMetaData();
      Text key = null;
      for (Entry<Writable, Writable> e : meta.entrySet()) {
        if ("urlid".equals(((Text) e.getKey()).toString())) {
          key = (Text) e.getValue();
          break;
        }
      }
      Put put = new Put(key.getBytes());
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("url"), url);
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Score"), Bytes.toBytes(value.getScore()));
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Status"), new byte[] { value.getStatus() });
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Fetchtime"), Bytes.toBytes(value.getFetchTime()));
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Retries"), new byte[] { value.getRetriesSinceFetch() });
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("FetchInterval"), Bytes.toBytes(value.getFetchInterval()));
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Modifiedtime"), Bytes.toBytes(value.getModifiedTime()));
      if (value.getSignature() != null && value.getSignature().length != 0)
        put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Signature"), value.getSignature());
      for (Entry<Writable, Writable> e : meta.entrySet()) {
        if (!"urlid".equals(((Text) e.getKey()).toString()))
          put.add(Bytes.toBytes("cf1"), Bytes.toBytes(e.getKey().toString()),
              Bytes.toBytes(e.getValue().toString()));
      }
      return put;
    }
  }
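  /*
   * Note on the row layout assumed by createValue()/createPut() above: every
   * CrawlDatum field lives in column family "cf1" under the qualifiers url,
   * Score, Status, Fetchtime, Retries, FetchInterval, Modifiedtime and
   * Signature. Any other qualifier is carried along as CrawlDatum metadata,
   * and the HBase row key travels in the metadata under the "urlid" key.
   */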
  public GeneratorRedHbase() {
  }

  public GeneratorRedHbase(Configuration conf) {
    setConf(conf);
  }

  /**
   * Generate fetchlists in one or more segments. Whether to filter URLs or
   * not is read from the crawl.generate.filter property in the configuration
   * files. If the property is not found, the URLs are filtered. The same
   * applies to normalisation.
   *
   * @param dbDir
   *          Crawl database directory
   * @param segments
   *          Segments directory
   * @param numLists
   *          Number of reduce tasks
   * @param topN
   *          Number of top URLs to be selected
   * @param curTime
   *          Current time in milliseconds
   * @return Path to generated segment or null if no entries were selected
   * @throws IOException
   *           When an I/O error occurs
   */
  public Path[] generate(Path dbDir, Path segments, int numLists, long topN,
      long curTime, boolean filter, boolean norm, boolean force,
      int maxNumSegments, int tableDepth) throws IOException {
    GenerateInfos.topn = topN;
    GenerateInfos.hostn = getConf().getInt(Generator.GENERATOR_MAX_COUNT, -1);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
      LOG.info("Generator: topN: " + topN);
    }
    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
      LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    if (maxNumSegments == -1)
      maxNumSegments = 1;
    List<Path> generatedSegments = new ArrayList<Path>();
    int j = 0; // index into the numbered crawl tables
    String table = null;
    boolean isSmart = getConf().getBoolean("nutch.smart.is", false);
    int tableMax = getConf().getInt("generate.table.Max", 10);
    for (int i = 0; i < maxNumSegments; i++) {
      Path segment = null;
      if (isSmart) {
        // Smart-generate mode: not implemented yet.
      } else {
        long segStart = System.currentTimeMillis();
        int cnt = 0;
        // keep reading tables until this segment receives at least one record
        while (cnt == 0) {
          if (j++ == tableDepth) { // no more tables to try
            if (generatedSegments.size() > 0)
              return generatedSegments.toArray(new Path[generatedSegments.size()]);
            else
              return null;
          }
          segment = new Path(segments, Generator.generateSegmentName());
          long begin = System.currentTimeMillis();
          table = dbDir.getName() + (tableMax + 1 - j);
          RunningJob r = generateJob(table, segment, numLists, topN - cnt,
              curTime, filter, norm, force);
          Counter counter = r.getCounters().findCounter("Generator", "records");
          cnt += counter.getValue();
          LOG.info("Generator: " + segment + " records: " + cnt + " current table="
              + table + " timeused=" + (System.currentTimeMillis() - begin) / 1000);
        }
        generatedSegments.add(segment);
        j--; // the same table may still have records; reuse it for the next segment
        GenerateInfo genInfo = GenerateInfos.getGenerateInfo();
        genInfo.start = segStart;
        genInfo.generate = cnt;
        genInfo.table = table;
        genInfo.end = System.currentTimeMillis();
        genInfo.endTime = sdf.format(genInfo.end);
      }
    }
    LOG.info(GenerateInfos.printString());

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: "
        + TimingUtil.elapsedTime(start, end));
    if (generatedSegments.size() > 0)
      return generatedSegments.toArray(new Path[generatedSegments.size()]);
    else
      return null;
  }
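  /*
   * Table naming sketch: the crawl tables are expected to carry numbered
   * suffixes of the crawldb directory name, counted down from
   * generate.table.Max. For example, with dbDir = /data/crawldb,
   * tableDepth = 3 and generate.table.Max = 10 (the default), the loop above
   * tries the HBase tables "crawldb10", then "crawldb9", then "crawldb8".
   */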
  private RunningJob generateJob(String table, Path segment, int numLists,
      long topN, long curTime, boolean filter, boolean norm, boolean force)
      throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
        + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));

    if (numLists == -1) {
      numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
      // override
      LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
      numLists = 1;
    }

    // record the real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATE_TABLE, table);
    job.setInt(GENERATE_REDUCE_NUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);

    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    return JobClient.runJob(job);
  }

  /**
   * Generate a fetchlist from the crawldb.
   */
  public static void main(String args[]) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new GeneratorRedHbase(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {
    long topN = 80000;
    for (int i = 0; i < args.length; i++) {
      if ("-topN".equals(args[i])) {
        topN = Long.parseLong(args[i + 1]);
        i++;
      }
    }
    try {
      Path[] segs = generate(new Path("/data/crawldb"), new Path("/data/segments"),
          4, topN, 0, false, false, false, 1, 1);
      if (segs == null)
        return -1;
    } catch (Exception e) {
      LOG.error("Generator: " + StringUtils.stringifyException(e));
      return -1;
    }
    return 0;
  }
}
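To try the generator, package the class with the Nutch job jar (with the HBase client jars on the classpath) and launch it through ToolRunner. Note that run() hardcodes the crawldb under /data/crawldb and the segments directory under /data/segments, so only -topN is read from the command line. A typical invocation (the jar name here is hypothetical) would look like:

    hadoop jar nutch-job.jar org.hxx.hadoop.GeneratorRedHbase -topN 100000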