/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.hxx.hadoop;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.NavigableMap;
import java.util.Random;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.HConnection;
import org.apache.hadoop.hbase.client.HConnectionManager;
import org.apache.hadoop.hbase.client.HTableInterface;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
import org.apache.hadoop.hbase.filter.Filter;
import org.apache.hadoop.hbase.filter.FilterList;
import org.apache.hadoop.hbase.filter.PageFilter;
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.crawl.URLPartitioner;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;

// Logging imports
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.hxx.hadoop.GeneratorHbase.TableTopInputFormat.TableKeyInputSplit;
import org.hxx.hadoop.stat.GenerateInfo;
import org.hxx.hadoop.stat.GenerateInfos;
import org.hxx.hbase.HostFilter;

/**
 * Generates a subset of a crawl db to fetch. This version allows to generate
 * fetchlists for several segments in one go. Unlike in the initial version
 * (OldGenerator), the IP resolution is done ONLY on the entries which have
 * been selected for fetching. The URLs are partitioned by IP, domain or host
 * within a segment. We can choose separately how to count the URLs, i.e. by
 * domain or host, to limit the entries.
 */
public class GeneratorHbase extends Generator {
  public static final Logger LOG = LoggerFactory.getLogger(GeneratorHbase.class);

  static final String GENERATL_TABLE = "generate.table";
  static final String GENERATL_REDUCECNT = "generate.reduceCnt";
  static int tableCacheSize = 50000;
  static int HBASE_REGIONSERVER_LEASE_PERIOD = 600000;

  private static Put createGenerateTime(byte[] url, CrawlDatum value, long generateTime) {
    Put put = createPutByDatum(url, value);
    // record the generate time alongside the datum columns
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes(Nutch.GENERATE_TIME_KEY), Bytes.toBytes(generateTime));
    return put;
  }

  /** Rebuilds a CrawlDatum from the "cf1" column family of an HBase row. */
  private static void createDatum(CrawlDatum datum, Result r) {
    NavigableMap<byte[], byte[]> map = r.getFamilyMap(Bytes.toBytes("cf1"));
    org.apache.hadoop.io.MapWritable metaData = new org.apache.hadoop.io.MapWritable();
    for (Iterator iterator = map.keySet().iterator(); iterator.hasNext();) {
      byte[] key = (byte[]) iterator.next();
      byte[] value = map.get(key);
      String skey = Bytes.toString(key);
      if ("url".equals(skey)) {
        // the URL is carried as the record key, nothing to copy here
      } else if ("Score".equals(skey)) {
        if (value != null)
          datum.setScore(Bytes.toFloat(value));
      } else if ("Status".equals(skey)) {
        if (value != null)
          datum.setStatus(value[0]);
      } else if ("Fetchtime".equals(skey)) {
        if (value != null)
          datum.setFetchTime(Bytes.toLong(value));
      } else if ("Retries".equals(skey)) {
        if (value != null)
          datum.setRetriesSinceFetch(value[0]);
      } else if ("FetchInterval".equals(skey)) {
        if (value != null)
          datum.setFetchInterval(Bytes.toInt(value));
      } else if ("Modifiedtime".equals(skey)) {
        if (value != null)
          datum.setModifiedTime(Bytes.toLong(value));
      } else if ("Signature".equals(skey)) {
        if (value != null)
          datum.setSignature(value);
      } else if (Nutch.GENERATE_TIME_KEY.equals(skey)) { // mfang, 2014/10/13
        if (value != null) {
          metaData.put(new Text(key), new LongWritable(Bytes.toLong(value)));
        }
      } else {
        metaData.put(new Text(key), new Text(value));
      }
    }
    metaData.put(new Text("urlid"), new Text(r.getRow()));
    datum.setMetaData(metaData);
  }

  /** Builds the HBase Put that mirrors a CrawlDatum back into the "cf1" family. */
  private static Put createPutByDatum(byte[] url, CrawlDatum value) {
    MapWritable meta = value.getMetaData();
    Text key = null;
    for (Entry<Writable, Writable> e : meta.entrySet()) {
      if ("urlid".equals(((Text) e.getKey()).toString())) {
        key = (Text) e.getValue();
        break;
      }
    }
    Put put = new Put(Bytes.toBytes(key.toString()));
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("url"), url);
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Score"), Bytes.toBytes(value.getScore()));
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Status"), new byte[] { value.getStatus() });
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Fetchtime"), Bytes.toBytes(value.getFetchTime()));
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Retries"), new byte[] { value.getRetriesSinceFetch() });
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("FetchInterval"), Bytes.toBytes(value.getFetchInterval()));
    put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Modifiedtime"), Bytes.toBytes(value.getModifiedTime()));
    if (value.getSignature() != null && value.getSignature().length != 0)
      put.add(Bytes.toBytes("cf1"), Bytes.toBytes("Signature"), value.getSignature());
    for (Entry<Writable, Writable> e : meta.entrySet()) {
      if (!"urlid".equals(((Text) e.getKey()).toString()))
        put.add(Bytes.toBytes("cf1"), Bytes.toBytes(e.getKey().toString()), Bytes.toBytes(e.getValue().toString()));
    }
    return put;
  }

  public static class TableTopInputFormat implements InputFormat<Text, CrawlDatum> {

    public static class TableReader implements RecordReader<Text, CrawlDatum> {
      private JobConf job;
      private long current = 0;
      private long total = Long.MAX_VALUE;
      Reporter reporter;
      private HConnection connection;
      private HTableInterface table;
      private ResultScanner rs;

      public TableReader(JobConf jobConf, String tableName, FilterList filters, String start, String end,
          Reporter reporter) {
        this.job = jobConf;
        this.reporter = reporter;
        HBaseConfiguration.merge(this.job, HBaseConfiguration.create(this.job));
        // this.job.setLong(HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY,
        // HBASE_REGIONSERVER_LEASE_PERIOD);
        tableCacheSize = job.getInt("commit.hbase.threshold", tableCacheSize);
        try {
          connection = HConnectionManager.createConnection(this.job);
          this.table = connection.getTable(tableName);
        } catch (IOException e) {
          e.printStackTrace();
        }
        reporter.setStatus("startkey=" + start + " endKey=" + end);
        init(filters, start, end);
      }

      private void init(FilterList filters, String start, String end) {
        Scan scan = new Scan();
        scan.setFilter(filters);
        scan.setCaching(tableCacheSize);
        if (start != null && !start.isEmpty())
          scan.setStartRow(Bytes.toBytes(start));
        if (end != null && !end.isEmpty())
          scan.setStopRow(Bytes.toBytes(end));
        try {
          rs = table.getScanner(scan);
        } catch (IOException e) {
          e.printStackTrace();
        }
      }

      public boolean next(Text key, CrawlDatum value) throws IOException {
        if (rs == null) {
          current = total;
          return false;
        }
        for (Result r : rs) {
          if (r == null || r.isEmpty()) {
            current = total;
            return false;
          }
          if (++current <= total) {
            byte[] urlByte = r.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("url"));
            key.set(urlByte);
            createDatum(value, r);
            return true;
          }
        }
        return false;
      }

      public Text createKey() {
        return new Text();
      }

      public CrawlDatum createValue() {
        return new CrawlDatum();
      }

      public long getPos() throws IOException {
        return current;
      }

      public void close() throws IOException {
        if (rs != null)
          rs.close();
        table.close();
        connection.close();
      }

      public float getProgress() throws IOException {
        // rough estimate; float division so progress is not truncated to zero
        return current / 50000000f;
      }
    }

    /** Row-key boundaries for splitting the scan: digits, letters, and "www."-prefixed letters. */
    protected static String[] getHostSplits() {
      String[] splits = new String[61];
      int i = 0;
      int len = 10;
      for (i = 0; i < len; i++) {
        splits[i] = String.valueOf(i);
      }
      char a = 'a';
      len += 22;
      for (; i < len; i++) {
        splits[i] = String.valueOf(a++);
      }
      a = 'a';
      String wwwPre = "www.";
      len += 26;
      for (; i < len; i++) {
        StringBuilder sb = new StringBuilder(wwwPre).append(a++);
        splits[i] = sb.toString();
      }
      splits[i++] = "x";
      splits[i++] = "y";
      splits[i++] = "z";
      return splits;
    }

    public static class TableKeyInputSplit extends org.apache.hadoop.mapreduce.InputSplit implements InputSplit {
      private String tableName = null;
      private String begin = "";
      private String end = "";

      public String getBegin() {
        return begin;
      }

      public void setBegin(String begin) {
        this.begin = begin;
      }

      public String getEnd() {
        return end;
      }

      public void setEnd(String end) {
        this.end = end;
      }

      public TableKeyInputSplit() {
      }

      public TableKeyInputSplit(String table, String begin, String end) {
        this.tableName = table;
        this.begin = begin;
        this.end = end;
      }

      public void write(DataOutput out) throws IOException {
        out.writeUTF(tableName);
        out.writeUTF(begin);
        out.writeUTF(end);
      }

      public void readFields(DataInput in) throws IOException {
        tableName = in.readUTF();
        begin = in.readUTF();
        end = in.readUTF();
      }

      public long getLength() throws IOException {
        return 1;
      }

      public String[] getLocations() throws IOException {
        return new String[] { tableName };
      }
    }

    private String table;

    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      table = job.get(GENERATL_TABLE);
      String[] hostSplits = getHostSplits();
      List<TableKeyInputSplit> list = new ArrayList<TableKeyInputSplit>();
      int divisor = 2;
      int mapCnt = hostSplits.length / divisor;
      for (int i = 0; i < mapCnt; i++) {
        TableKeyInputSplit split = null;
        if (i == 0) // first
          split = new TableKeyInputSplit(table, "1", hostSplits[(i + 1) * divisor]);
        else if (i == mapCnt - 1) // last
          split = new TableKeyInputSplit(table, hostSplits[i * divisor], "~");
        else
          split = new TableKeyInputSplit(table, hostSplits[i * divisor], hostSplits[(i + 1) * divisor]);
        list.add(split);
      }
      return list.toArray(new TableKeyInputSplit[] {});
    }

    @Deprecated
    public InputSplit[] getSplitsBak(JobConf job, int numSplits) throws IOException {
      table = job.get(GENERATL_TABLE);
      int reduceCnt = job.getInt(GENERATL_REDUCECNT, 8);
      String[] hostSplits = getHostSplits();
      int len = Math.round(Integer.valueOf(hostSplits.length).floatValue() / Integer.valueOf(reduceCnt).floatValue());
      List<TableKeyInputSplit> list = new ArrayList<TableKeyInputSplit>();
      for (int i = 0; i < reduceCnt; i++) {
        TableKeyInputSplit split = null;
        if (i == 0)
          split = new TableKeyInputSplit(table, "", hostSplits[len - 1]);
        else if (i == reduceCnt - 1)
          split = new TableKeyInputSplit(table, hostSplits[len * i - 1], "");
        else
          split = new TableKeyInputSplit(table, hostSplits[len * i - 1], hostSplits[len * (i + 1) - 1]);
        list.add(split);
      }
      return list.toArray(new TableKeyInputSplit[] {});
    }

    public RecordReader<Text, CrawlDatum> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
      if (table == null)
        table = job.get("generate.table");
      int intervalThreshold = job.getInt(Generator.GENERATOR_MIN_INTERVAL, -1);
      long curTime = job.getLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
      int hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1);
      long topn = job.getLong(Generator.GENERATOR_TOP_N, 0);
      List<Filter> tmp = new ArrayList<Filter>();
      // mfang 2014/09/29: isSmart=true generate mode
      boolean isSmart = job.getBoolean("nutch.smart.is", false);
      if (!isSmart && !job.getBoolean("generate.test", true)) {
        // check fetch schedule
        SingleColumnValueFilter columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"),
            Bytes.toBytes("Fetchtime"), CompareOp.LESS_OR_EQUAL, Bytes.toBytes(curTime));
        columnFilter.setFilterIfMissing(true);
        tmp.add(columnFilter);
        // skip entries generated within the last generate.delay window
        columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes(Nutch.GENERATE_TIME_KEY),
            CompareOp.LESS_OR_EQUAL,
            Bytes.toBytes(curTime - job.getLong(Generator.GENERATOR_DELAY, 24 * 3600 * 1000l)));
        tmp.add(columnFilter);
        // consider only entries with a retry (or fetch) interval lower than the threshold
        if (intervalThreshold > 0) {
          columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes("FetchInterval"),
              CompareOp.LESS_OR_EQUAL, Bytes.toBytes(intervalThreshold));
          tmp.add(columnFilter);
        }
        // require a positive score
        columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes("Score"), CompareOp.GREATER,
            Bytes.toBytes(0f));
        tmp.add(columnFilter);
      }
      // skip entries already marked as gone
      SingleColumnValueFilter columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes("Status"),
          CompareOp.NOT_EQUAL, new byte[] { CrawlDatum.STATUS_DB_GONE });
      tmp.add(columnFilter);
      if (hostn > 0) { // limit entries per host
        Filter filter = new HostFilter(hostn);
        tmp.add(filter);
      }
      if (topn > 0) { // limit total entries per scan
        Filter filter = new PageFilter(topn);
        tmp.add(filter);
      }
      FilterList filters = new FilterList(tmp);
      return new TableReader(job, table, filters, ((TableKeyInputSplit) split).getBegin(),
          ((TableKeyInputSplit) split).getEnd(), reporter);
    }
  }

  public static class CodeInputFormat implements InputFormat<IntWritable, IntWritable> {

    public static class CodeReader implements RecordReader<IntWritable, IntWritable> {
      private int current = 0;
      private int reduceCnt = 1;

      public CodeReader(int reduceCnt) {
        super();
        this.reduceCnt = reduceCnt;
      }

      public boolean next(IntWritable key, IntWritable value) throws IOException {
        if (current == reduceCnt)
          return false;
        key.set(current);
        value.set(current);
        current++;
        return true;
      }

      public IntWritable createKey() {
        return new IntWritable(0);
      }

      public IntWritable createValue() {
        return new IntWritable(0);
      }

      public long getPos() throws IOException {
        return current;
      }

      public void close() throws IOException {
      }

      public float getProgress() throws IOException {
        // float division so progress is not truncated to zero
        return current / (float) reduceCnt;
      }
    }

    public static class CustomInputSplit extends org.apache.hadoop.mapreduce.InputSplit implements InputSplit {
      private int reduceCnt = 1;

      public CustomInputSplit() {
        super();
      }

      public CustomInputSplit(int reduceCnt) {
        super();
        this.reduceCnt = reduceCnt;
      }

      public void write(DataOutput out) throws IOException {
        out.writeInt(reduceCnt);
      }

      public void readFields(DataInput in) throws IOException {
        reduceCnt = in.readInt();
      }

      public long getLength() throws IOException {
        return reduceCnt; // number of reduce tasks
      }

      public String[] getLocations() throws IOException {
        return new String[] {};
      }
    }

    public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
      return new CustomInputSplit[] { new CustomInputSplit(job.getInt(GeneratorHbase.GENERATL_REDUCECNT, 1)) };
    }

    public RecordReader<IntWritable, IntWritable> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
        throws IOException {
      return new CodeReader(job.getInt(GeneratorHbase.GENERATL_REDUCECNT, 1));
    }
  }

  /** Selects entries due for fetch. */
  public static class GenerateMark implements Mapper<Text, CrawlDatum, Text, CrawlDatum>,
      Partitioner<Text, CrawlDatum>, Reducer<IntWritable, IntWritable, Text, CrawlDatum> {
    private boolean filter;
    private URLFilters filters;
    JobConf conf;
    private Map hostCnt = new HashMap();
    private long topn = tableCacheSize;
    private int hostn = -1;
    private long generateTime;
    private int reduceCnt = 1;
    private int seed;
    private Partitioner<Text, Writable> partitioner = new URLPartitioner();
    private HConnection connection;
    private HTableInterface table;
    private long cnt = 0;
    private long curTimeMillis = 0;
    // private long lastGenTimeMillis = 0;
    private boolean isSmart = false;

    // static long GetZeroTimeSeconds() {
    // Calendar c = Calendar.getInstance();
    // try {
    // c.setTime(new SimpleDateFormat("yyyy-MM-dd")
    // .parse("2014-01-01"));
    // } catch (ParseException e) {
    // e.printStackTrace();
    // }
    // return c.getTimeInMillis() / 1000L;
    // }
    // static final long ZeroTimeMillis = GetZeroTimeSeconds() * 1000L;

    public void configure(JobConf job) {
      filter = job.getBoolean(GENERATOR_FILTER, true);
      if (filter)
        filters = new URLFilters(job);
      reduceCnt = job.getInt(GeneratorHbase.GENERATL_REDUCECNT, 1);
      topn = job.getLong(Generator.GENERATOR_TOP_N, tableCacheSize);
      hostn = job.getInt(Generator.GENERATOR_MAX_COUNT, -1);
      generateTime = job.getLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis());
      seed = job.getInt("partition.url.seed", 0);
      partitioner.configure(job);
      tableCacheSize = job.getInt("commit.hbase.threshold", tableCacheSize);
      String tableName = job.get(GENERATL_TABLE);
      HBaseConfiguration.merge(job, HBaseConfiguration.create(job));
      // job.setLong(HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY,
      // HBASE_REGIONSERVER_LEASE_PERIOD);
      LOG.info(HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY + "==============="
          + job.getLong(HConstants.HBASE_REGIONSERVER_LEASE_PERIOD_KEY, 0));
      try {
        connection = HConnectionManager.createConnection(job);
        table = connection.getTable(tableName);
        table.setAutoFlush(false, true);
        table.setWriteBufferSize(300 * tableCacheSize);
      } catch (IOException e) {
        e.printStackTrace();
      }
      conf = job;
      isSmart = job.getBoolean("nutch.smart.is", false);
      if (isSmart) {
        curTimeMillis = job.getLong(Nutch.GENERATE_TIME_KEY, System.currentTimeMillis()); // - ZeroTimeMillis;
        // lastGenTimeMillis = job.getLong(Nutch.LAST_GENERATE_TIME_KEY,
        // curTimeMillis - 5 * 60 * 1000L) - ZeroTimeMillis;
        // only select URLs in the [lastGenTime, curTime] window
      }
    }

    public void close() {
      try {
        table.flushCommits();
        table.close();
        connection.close();
      } catch (IOException e) {
        e.printStackTrace();
      }
      LOG.info("total=" + cnt + "; hosts=" + hostCnt.size());
    }

    public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
        throws IOException {
      if (filter) {
        try {
          if (filters.filter(key.toString()) == null)
            return;
        } catch (URLFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't filter url: " + key + " (" + e.getMessage() + ")");
          }
        }
      }
      LOG.info("**** isSmart :" + isSmart);
      // mfang 2014/09/29: smart generate mode
      if (isSmart) {
        long lastGenTimeMillis = 0;
        Writable temp = value.getMetaData().get(new Text(Nutch.GENERATE_TIME_KEY));
        if (temp != null) {
          lastGenTimeMillis = ((LongWritable) temp).get();
        }
        // else {
        // LOG.info("**** GENERATE_TIME is null");
        // }
        LOG.info("**** URL :" + key.toString());
        LOG.info("**** getFetchInterval :" + value.getFetchInterval());
        LOG.info("**** lastGenTimeMillis :" + lastGenTimeMillis);
        LOG.info("**** curTimeMillis :" + curTimeMillis);
        // x1 = fetch interval in ms; skip the URL if less than x1 has elapsed
        // since it was last generated
        long x1 = value.getFetchInterval() * 1000L;
        if ((curTimeMillis - lastGenTimeMillis) < x1)
          // || lastGenTimeMillis % x1 >= curTimeMillis % x1)
          return;
      }
      if (!isSmart && !hostFilte(key.toString()))
        return;
      Put put = createGenerateTime(Bytes.toBytes(key.toString()), value, generateTime);
      table.put(put);
      if (++cnt % tableCacheSize == 0) {
        table.flushCommits();
        String region = ((TableKeyInputSplit) reporter.getInputSplit()).getBegin() + "-"
            + ((TableKeyInputSplit) reporter.getInputSplit()).getEnd();
        reporter.setStatus(region + " commit:" + cnt);
      }
      output.collect(key, value); // partition
      reporter.incrCounter("Generator", "records", 1);
    }

    private boolean hostFilte(String url) {
      String host = getHost(url);
      if (host != null && !host.isEmpty()) {
        if (hostCnt.containsKey(host)) {
          AtomicLong cnt = (AtomicLong) hostCnt.get(host);
          cnt.incrementAndGet();
        } else {
          hostCnt.put(host, new AtomicLong(1));
        }
        return true;
      }
      return false;
    }

    @Override
    @Deprecated
    public void reduce(IntWritable key, Iterator<IntWritable> values, OutputCollector<Text, CrawlDatum> output,
        Reporter reporter) throws IOException {
      LOG.info("generatorHbase:load url from partition=" + key.get());
      int part = key.get();
      long partTopn = topn / reduceCnt; // per-partition topn
      ResultScanner rs = getRS();
      if (rs == null) {
        return;
      }
      for (Result r : rs) {
        if (cnt == partTopn) {
          return;
        }
        if (r == null || r.isEmpty()) {
          return;
        }
        byte[] urlByte = r.getValue(Bytes.toBytes("cf1"), Bytes.toBytes("url"));
        if (!filteUrl(Bytes.toString(urlByte), part)) {
          continue;
        }
        Text urlKey = new Text(urlByte);
        CrawlDatum value = new CrawlDatum();
        createDatum(value, r);
        // if (!smartFilter(value))
        // continue;
        Put put = createGenerateTime(Bytes.toBytes(urlKey.toString()), value, generateTime);
        table.put(put);
        if (++cnt % 10000 == 0) {
          table.flushCommits();
        }
        output.collect(urlKey, value); // OldOutputCollector
        reporter.incrCounter("Generator", "records", 1);
      }
      rs.close();
    }

    private boolean filteUrl(String url, int part) {
      if (filter) {
        // don't generate URLs that don't pass URLFilters
        try {
          if (filters.filter(url) == null)
            return false;
        } catch (URLFilterException e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
          }
          return false;
        }
      }
      String host = getHost(url);
      if (part != getHashPartition(url, host)) // does the URL belong to this partition?
        return false;
      if (host != null) {
        if (hostCnt.containsKey(host)) {
          AtomicLong cnt = (AtomicLong) hostCnt.get(host);
          cnt.incrementAndGet();
          if (hostn != -1) {
            if (cnt.get() <= hostn)
              return true;
            else {
              cnt.decrementAndGet();
              return false;
            }
          }
          return true;
        } else {
          hostCnt.put(host, new AtomicLong(1));
          return true;
        }
      }
      return false;
    }

    private String getHost(String url) {
      String host = null;
      try {
        URL tmp = new URL(url);
        host = tmp.getHost();
      } catch (MalformedURLException e) {
        // e.printStackTrace();
      }
      return host;
    }

    /** Hash by domain name. */
    private int getHashPartition(String urlString, String host) {
      int hashCode = urlString.hashCode();
      if (host != null)
        hashCode = host.hashCode();
      // make hosts wind up in different partitions on different runs
      hashCode ^= seed;
      int part = (hashCode & Integer.MAX_VALUE) % reduceCnt;
      return part;
    }

    private ResultScanner getRS() throws IOException {
      ResultScanner rs = null;
      Scan scan = new Scan();
      scan.setFilter(getFilters());
      scan.setCaching(Long.valueOf(topn).intValue());
      try {
        rs = table.getScanner(scan);
      } catch (IOException e) {
        e.printStackTrace();
      }
      return rs;
    }

    private FilterList getFilters() throws IOException {
      int intervalThreshold = conf.getInt(Generator.GENERATOR_MIN_INTERVAL, -1);
      List<Filter> tmp = new ArrayList<Filter>();
      // check fetch schedule
      SingleColumnValueFilter columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"),
          Bytes.toBytes("Fetchtime"), CompareOp.LESS_OR_EQUAL, Bytes.toBytes(generateTime));
      columnFilter.setFilterIfMissing(true);
      tmp.add(columnFilter);
      // skip entries generated within the last generate.delay window
      columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes(Nutch.GENERATE_TIME_KEY),
          CompareOp.LESS_OR_EQUAL,
          Bytes.toBytes(generateTime - conf.getLong(Generator.GENERATOR_DELAY, 24 * 3600 * 1000l)));
      tmp.add(columnFilter);
      // consider only entries with a retry (or fetch) interval lower than the threshold
      if (intervalThreshold > 0) {
        columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes("FetchInterval"),
            CompareOp.LESS_OR_EQUAL, Bytes.toBytes(intervalThreshold));
        tmp.add(columnFilter);
      }
      // columnFilter = new SingleColumnValueFilter(Bytes.toBytes("cf1"), Bytes.toBytes("Score"),
      // CompareOp.GREATER_OR_EQUAL, Bytes.toBytes(0f));
      // columnFilter.setFilterIfMissing(true);
      // tmp.add(columnFilter);
      if (hostn > 0) { // limit entries per host
        Filter filter = new HostFilter(hostn);
        tmp.add(filter);
      }
      // limit total entries (topn)
      Filter filter = new PageFilter(topn);
      tmp.add(filter);
      FilterList filters = new FilterList(tmp);
      return filters;
    }

    @Override
    public int getPartition(Text key, CrawlDatum value, int numPartitions) {
      return partitioner.getPartition(key, value, numPartitions);
    }
  }

  public GeneratorHbase() {
  }

  public GeneratorHbase(Configuration conf) {
    setConf(conf);
  }

  public Path generateAll(int tableNum, Path segments, int reduceCnt, boolean filter, boolean norm, boolean force) {
    return generateAll(tableNum, segments, 0, reduceCnt, filter, norm, force);
  }

  public Path generateAll(int tableNum, Path segments, long topN, int reduceCnt, boolean filter, boolean norm,
      boolean force) {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: from table=" + tableNum + " starting at " + sdf.format(start));
    LOG.info("Generator: filtering=" + filter + "; Generator: normalizing=" + norm);
    Path segment = new Path(segments, Generator.generateSegmentName());
    String table = "crawldb" + tableNum;
    long cnt = 0;
    try {
      RunningJob r = generateJob(table, segment, topN, reduceCnt, filter, norm, force);
      Counter counter = r.getCounters().findCounter("Generator", "records");
      cnt = counter.getValue();
      if (r.isSuccessful()) {
        // LOG.info(Nutch.GEN_JOB_SUCCESS + "=1;");
      } else {
        // LOG.info(Nutch.GEN_JOB_FAIL + "=1;");
      }
      LOG.info("Generator: " + segment + " records: " + cnt + " current table=" + table + " timeused="
          + (System.currentTimeMillis() - start) / 1000 + "s");
    } catch (Throwable e) {
      removePath(segment);
      LOG.error("generateAll:", e);
    }
    int less = getConf().getInt("generator.less", 10000);
    if (cnt == 0) {
      removePath(segment);
      return null;
    } else if (cnt <= less) { // too few records to keep the segment
      removePath(segment);
      return null;
    }
    long end = System.currentTimeMillis();
    // LOG.info(Nutch.GEN_TIME + "=" + (end - start) + ";");
    // have records
    GenerateInfos.topn = topN;
    GenerateInfos.hostn = getConf().getInt(Generator.GENERATOR_MAX_COUNT, -1);
    GenerateInfo genInfo = GenerateInfos.getGenerateInfo();
    genInfo.start = start;
    genInfo.generate = cnt;
    genInfo.table = table;
    genInfo.end = end;
    genInfo.endTime = sdf.format(genInfo.end);
    LOG.info(GenerateInfos.printString());
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    return segment;
  }

  public boolean removePath(Path segment) {
    try {
      FileSystem fs = FileSystem.get(getConf());
      if (!fs.exists(segment))
        return false;
      return fs.delete(segment, true);
    } catch (Throwable e) {
      LOG.error("generator removePath=" + segment, e);
    }
    return false;
  }

  private RunningJob generateJob(String table, Path segment, long topN, int reduceCnt, boolean filter, boolean norm,
      boolean force) throws IOException {
    LOG.info("Generator: from table=" + table + " segment=" + segment);
    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " " + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis())
        + " path=" + segment);
    if (reduceCnt == -1) {
      reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
      LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
      reduceCnt = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());
    job.setInputFormat(TableTopInputFormat.class); // scan the HBase crawldb table directly
    job.setMapperClass(GenerateMark.class); // marks generated entries back into HBase
    job.setPartitionerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);
    RunningJob r = JobClient.runJob(job);
    return r;
  }

  public Path[] generate(Path segments, int reduceCnt, long topN, boolean filter, boolean norm, boolean force,
      int segCount, int tableDepth) throws IOException {
    GenerateInfos.topn = topN;
    GenerateInfos.hostn = getConf().getInt(Generator.GENERATOR_MAX_COUNT, -1);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
      LOG.info("Generator: topN: " + topN);
    }
    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
      LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }
    if (segCount == -1)
      segCount = 1;
    List<Path> generatedSegments = new ArrayList<Path>();
    int j = 0; // index used to build the table name
    String table = null;
    boolean isSmart = getConf().getBoolean("nutch.smart.is", false);
    int tableMax = getConf().getInt("generate.table.Max", 10);
    for (int i = 0; i < segCount; i++) {
      Path segment = null;
      if (isSmart) {
      } else {
        long segStart = System.currentTimeMillis();
        int cnt = 0;
        while (cnt == 0) { // keep trying tables until some records are generated
          if (j++ == tableDepth) { // reached the table-depth limit
            if (generatedSegments.size() > 0)
              return generatedSegments.toArray(new Path[generatedSegments.size()]);
            else
              return null;
          }
          segment = new Path(segments, Generator.generateSegmentName());
          long begin = System.currentTimeMillis();
          table = "crawldb" + (tableMax + 1 - j);
          RunningJob r = generateJob(table, segment, reduceCnt, topN - cnt, filter, norm, force);
          Counter counter = r.getCounters().findCounter("Generator", "records");
          cnt += counter.getValue();
          LOG.info("Generator: " + segment + " records: " + cnt + " current table=" + table + " timeused="
              + (System.currentTimeMillis() - begin) / 1000);
        }
        generatedSegments.add(segment);
        j--;
        GenerateInfo genInfo = GenerateInfos.getGenerateInfo();
        genInfo.start = segStart;
        genInfo.generate = cnt;
        genInfo.table = table;
        genInfo.end = System.currentTimeMillis();
        genInfo.endTime = sdf.format(genInfo.end);
      }
    }
    long end = System.currentTimeMillis();
    LOG.info(GenerateInfos.printString());
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    if (generatedSegments.size() > 0)
      return generatedSegments.toArray(new Path[generatedSegments.size()]);
    else
      return null;
  }

  private RunningJob generateJob(String table, Path segment, int reduceCnt, long topN, boolean filter, boolean norm,
      boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);
    JobConf job = new NutchJob(getConf());
    // job.setJarByClass(GeneratorHbase.class);
    job.setJobName("generate:" + table + " " + (new SimpleDateFormat("HH:mm:ss")).format(System.currentTimeMillis())
        + " path=" + segment);
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);
    if (reduceCnt == -1) {
      reduceCnt = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && reduceCnt != 1) {
      LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
      reduceCnt = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCECNT, reduceCnt);
    job.setInt("partition.url.seed", new Random().nextInt());
    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(reduceCnt);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);
    RunningJob r = JobClient.runJob(job);
    return r;
  }

  /**
   * Generate a fetchlist from the crawldb.
   */
  public static void main(String args[]) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(), new GeneratorHbase(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {
    long topN = 80000;
    for (int i = 0; i < args.length; i++) {
      if ("-topN".equals(args[i])) {
        topN = Long.parseLong(args[i + 1]);
        i++;
      }
    }
    try {
      Path[] segs = generate(new Path("/data/segments"), 4, topN, false, false, false, 1, 1);
      if (segs == null)
        return -1;
    } catch (Exception e) {
      LOG.error("Generator: " + StringUtils.stringifyException(e));
      return -1;
    }
    return 0;
  }
}
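
// ---------------------------------------------------------------------------
// Usage sketch (not part of the original class): a minimal driver that
// launches the generator the same way main() does via ToolRunner. The -topN
// value, the driver class name, and how the job jar is packaged are
// assumptions; the segment directory (/data/segments) and the "crawldb" + N
// table naming are hard-coded in run()/generateAll() above.
// ---------------------------------------------------------------------------
class GeneratorHbaseDriver {
  public static void main(String[] args) throws Exception {
    // Roughly equivalent to running GeneratorHbase from the command line with "-topN 100000".
    int res = org.apache.hadoop.util.ToolRunner.run(org.apache.nutch.util.NutchConfiguration.create(),
        new org.hxx.hadoop.GeneratorHbase(), new String[] { "-topN", "100000" });
    System.exit(res);
  }
}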