com.TCG.Nutch_DNS.Injector.java Source code

Introduction

Here is the source code for com.TCG.Nutch_DNS.Injector.java
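
The class is a modified Apache Nutch injector: it reads an existing crawlDb, groups the CrawlDatum entries by host, sums their scores into a single entry per host, and either installs the result as a new hostDb or merges it into an existing one. A minimal programmatic call might look like the following sketch; the two paths are illustrative:

    Configuration conf = NutchConfiguration.create();
    Injector injector = new Injector(conf);
    injector.inject(new Path("hostdb"), new Path("crawldb")); // <hostdb> <crawldb>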

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.TCG.Nutch_DNS;

import java.io.*;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.*;

// SLF4J logging imports

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.fs.*;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.NutchTool;
import org.apache.nutch.util.TimingUtil;
import com.TCG.util.*;

public class Injector extends NutchTool implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(Injector.class);

    /** Map each injected URL to its host name. */
    public static class InjectMapper implements Mapper<Text, CrawlDatum, Text, CrawlDatum> {

        public void configure(JobConf job) {
        }

        public void close() {
        }

        public void map(Text key, CrawlDatum value, OutputCollector<Text, CrawlDatum> output, Reporter reporter)
                throws IOException {
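            // e.g. new URL("http://example.com/page.html").getHost() -> "example.com",
            // so every URL on the same host is grouped under one reducer key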

            URL url = new URL(key.toString()); // key is the injected URL
            Text host = new Text(url.getHost());

            output.collect(host, value);
        }
    }

    // Reducer used when the hostDb already exists: its output is later merged into the old hostDb by InjectReducer.
    public static class ExitHostReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {

        private long curTime;
        private JobConf jobConf;
        private int interval;

        @Override
        public void configure(JobConf job) {
            this.jobConf = job;
            curTime = job.getLong("injector.current.time", System.currentTimeMillis());
            interval = jobConf.getInt("db.fetch.interval.default", 2592000);
        }

        @Override
        public void close() throws IOException {
        }

        private CrawlDatum datum = new CrawlDatum();

        @Override
        public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
                Reporter report) throws IOException {
            // Sum the scores of all URLs that belong to this host.
            float sum = 0;
            while (values.hasNext()) {
                CrawlDatum val = values.next();
                sum += val.getScore();
            }
            datum.setScore(sum);
            datum.setFetchTime(curTime);
            datum.setFetchInterval(interval);
            datum.setStatus(CrawlDatum.STATUS_INJECTED); // flags the entry for the merge job
            output.collect(key, datum); // emit one aggregated entry per host
        }

    }

    // Reducer used when no hostDb exists yet: its output is installed directly as the initial hostDb.
    public static class NotExitHostReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {

        private long curTime;
        private JobConf jobConf;
        private int interval;

        @Override
        public void configure(JobConf job) {
            this.jobConf = job;
            curTime = job.getLong("injector.current.time", System.currentTimeMillis());
            interval = jobConf.getInt("db.fetch.interval.default", 2592000);
        }

        @Override
        public void close() throws IOException {
        }

        private CrawlDatum datum = new CrawlDatum();

        @Override
        public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
                Reporter report) throws IOException {
            // Sum the scores of all URLs that belong to this host.
            float sum = 0;
            while (values.hasNext()) {
                CrawlDatum val = values.next();
                sum += val.getScore();
            }
            datum.setScore(sum);
            datum.setFetchTime(curTime);
            datum.setFetchInterval(interval);
            datum.setStatus(CrawlDatum.STATUS_DB_UNFETCHED); // goes straight into the new hostDb
            output.collect(key, datum); // emit one aggregated entry per host
        }

    }
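
    /*
     * Both reducers read the same configuration:
     *   injector.current.time     - injection timestamp, set by inject() below
     *   db.fetch.interval.default - re-fetch interval in seconds
     *                               (2592000 s = 30 days)
     * They differ only in the status they assign: STATUS_INJECTED feeds the
     * merge job, while STATUS_DB_UNFETCHED goes directly into a fresh hostDb.
     */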

    /** Combine multiple new entries for a host. */
    public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        private int interval;
        private float scoreInjected;
        private boolean overwrite = false;
        private boolean update = false;

        public void configure(JobConf job) {
            interval = job.getInt("db.fetch.interval.default", 2592000);
            scoreInjected = job.getFloat("db.score.injected", 1.0f);
            overwrite = job.getBoolean("db.injector.overwrite", false);
            update = job.getBoolean("db.injector.update", false);
            LOG.info("Injector: overwrite: " + overwrite);
            LOG.info("Injector: update: " + update);
        }

        public void close() {
        }

        private CrawlDatum old = new CrawlDatum();
        private CrawlDatum injected = new CrawlDatum();

        public void reduce(Text key, Iterator<CrawlDatum> values, OutputCollector<Text, CrawlDatum> output,
                Reporter reporter) throws IOException {

            boolean oldSet = false;
            boolean injectedSet = false;

            while (values.hasNext()) {
                CrawlDatum val = values.next();
                if (val.getStatus() == CrawlDatum.STATUS_INJECTED) { // freshly injected entry (from tempDir)
                    injected.set(val);
                    injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
                    injectedSet = true;
                } else { // existing entry from the old hostDb
                    old.set(val);
                    oldSet = true;
                }

            }

            CrawlDatum res = null;

            // Old default behaviour
            if (injectedSet && !oldSet) {
                res = injected;
            } else {
                res = old;
            }
            if (injectedSet && oldSet) {
                reporter.getCounter("injector", "urls_merged").increment(1);
            }
            /**
             * Whether to overwrite, ignore or update existing records
             * 
             * @see https://issues.apache.org/jira/browse/NUTCH-1405
             */
            // Injected record already exists and update but not overwrite
            if (injectedSet && oldSet && update && !overwrite) {
                res = old;
                old.putAllMetaData(injected);
                old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
                old.setFetchInterval(injected.getFetchInterval() != interval ? injected.getFetchInterval()
                        : old.getFetchInterval());
            }

            // Injected record already exists and overwrite
            if (injectedSet && oldSet && overwrite) {
                res = injected;
            }

            output.collect(key, res);
        }
    }
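
    /*
     * Merge outcomes for a host present in both the old hostDb and the new
     * injection (see NUTCH-1405):
     *   overwrite=true               -> keep the injected entry
     *   update=true, overwrite=false -> keep the old entry, folding in the
     *                                   injected metadata, score and interval
     *   both false (default)         -> keep the old entry unchanged
     */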

    public Injector() {
    }

    public Injector(Configuration conf) {
        setConf(conf);
    }

    public void inject(Path hostDb, Path crawlDb) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: hostDb: " + hostDb);
            LOG.info("Injector: carwlDb: " + crawlDb);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // map crawlDb entries to <host, CrawlDatum> pairs
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Converting crawlDb entries to hostDb entries.");
        }

        FileSystem fs = FileSystem.get(getConf());
        // determine whether the hostDb already exists
        boolean dbExists = fs.exists(hostDb);

        JobConf sortJob = new NutchJob(getConf());
        sortJob.setJobName("inject " + hostDb);
        FileInputFormat.addInputPath(sortJob, crawlDb);
        sortJob.setMapperClass(InjectMapper.class);

        FileOutputFormat.setOutputPath(sortJob, tempDir);
        if (dbExists) {
            sortJob.setOutputFormat(SequenceFileOutputFormat.class);
            // hostDb exists: mark entries as injected so the merge job can recognize them
            sortJob.setReducerClass(ExitHostReducer.class);
        } else {
            sortJob.setOutputFormat(MapFileOutputFormat.class);
            // no hostDb yet: the output of this job becomes the initial hostDb
            sortJob.setReducerClass(NotExitHostReducer.class);
            sortJob.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);
        }
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(CrawlDatum.class);
        sortJob.setLong("injector.current.time", System.currentTimeMillis());

        try {
            JobClient.runJob(sortJob);
        } catch (IOException e) {
            fs.delete(tempDir, true); // clean up the temp dir on failure
            throw e;
        }

        if (dbExists) {

            // merge with existing host db
            if (LOG.isInfoEnabled()) {
                LOG.info("Injector: Merging injected hostDb into old hostDb.");
            }
            JobConf mergeJob = HostDb.createJob(getConf(), hostDb);
            FileInputFormat.addInputPath(mergeJob, tempDir);
            // replace the default reducer set by HostDb.createJob with InjectReducer
            mergeJob.setReducerClass(InjectReducer.class);
            try {
                JobClient.runJob(mergeJob);
            } catch (IOException e) {
                fs.delete(tempDir, true);
                throw e;
            }
            HostDb.install(mergeJob, hostDb);
        } else {
            HostDb.install(sortJob, hostDb);
        }

        // clean up
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }
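
    /*
     * Job flow of inject():
     *   1. sortJob reads the crawlDb and groups CrawlDatum entries by host
     *      into tempDir.
     *      - hostDb exists:  ExitHostReducer marks hosts STATUS_INJECTED and
     *        a second job (InjectReducer) merges tempDir into the old hostDb.
     *      - no hostDb yet:  NotExitHostReducer marks hosts
     *        STATUS_DB_UNFETCHED and tempDir is installed as the new hostDb.
     *   2. tempDir is deleted in either case.
     */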

    public static void main(String[] args) throws Exception {
        int res = ToolRunner.run(NutchConfiguration.create(), new Injector(), args);
        System.exit(res);
    }

    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            System.err.println("Usage: Injector <hostdb> <crawldb>");
            return -1;
        }
        try {
            inject(new Path(args[0]), new Path(args[1]));
            return 0;
        } catch (Exception e) {
            LOG.error("Injector: " + StringUtils.stringifyException(e));
            return -1;
        }
    }

    /**
     * Used by the Nutch REST service.
     */
    @Override
    public Map<String, Object> run(Map<String, String> args, String hostId) throws Exception {
        if (args.size() < 1) {
            throw new IllegalArgumentException("Required arguments <url_dir>");
        }
        Map<String, Object> results = new HashMap<String, Object>();
        String RESULT = "result";
        String hostdb = hostId + "/hostdb";
        String crawldb = args.get("crawldb");

        inject(new Path(hostdb), new Path(crawldb));
        results.put(RESULT, Integer.toString(0));
        return results;

    }

}
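
For reference, here is a sketch of a driver that tunes the configuration keys the injector reads before launching it. The class name InjectorDemo and the chosen values are illustrative; only the property keys come from the source above.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.util.ToolRunner;
    import org.apache.nutch.util.NutchConfiguration;
    import com.TCG.Nutch_DNS.Injector;

    public class InjectorDemo {
        public static void main(String[] args) throws Exception {
            Configuration conf = NutchConfiguration.create();
            // Merge policy for hosts already in the hostDb (see NUTCH-1405):
            // update folds new data into the old entry, overwrite replaces it.
            conf.setBoolean("db.injector.update", true);
            conf.setBoolean("db.injector.overwrite", false);
            // Default re-fetch interval in seconds (2592000 s = 30 days).
            conf.setInt("db.fetch.interval.default", 2592000);
            // args: <hostdb> <crawldb>
            System.exit(ToolRunner.run(conf, new Injector(), args));
        }
    }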