net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java Source code

Introduction

Here is the source code for net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java, a Nutch tool that removes duplicate documents (different URLs, same digest) from a Solr index.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.peacesoft.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Date;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.solr.SolrUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Utility class for deleting duplicate documents from a Solr index.
 *
 * The algorithm works as follows:
 *
 * Preparation: <ol> <li>Query the solr server for the number of documents (say,
 * N)</li> <li>Partition N among M map tasks. For example, if we have two map
 * tasks the first map task will deal with solr documents from 0 - (N / 2 - 1)
 * and the second will deal with documents from (N / 2) to (N - 1).</li> </ol>
 *
 * MapReduce: <ul> <li>Map: Identity map where keys are digests and values are
 * {@link SolrRecord} instances (which, in this variant, carry only the
 * document id; the boost and timestamp fields are commented out)</li>
 * <li>Reduce: After the map phase, {@link SolrRecord}s with the same digest
 * are grouped together. The first record in each group is kept and every
 * other document with the same digest is deleted from the Solr index (the
 * original comparison by boost and timestamp is commented out below).</li> </ul>
 *
 * Note that unlike {@link DeleteDuplicates} we assume that two documents in a
 * Solr index will never have the same URL. So this class only deals with
 * documents with <b>different</b> URLs but the same digest.
 */
public class RaovatPostDeleteDuplicates implements
        Reducer<Text, RaovatPostDeleteDuplicates.SolrRecord, Text, RaovatPostDeleteDuplicates.SolrRecord>, Tool {

    public static final Logger LOG = LoggerFactory.getLogger(RaovatPostDeleteDuplicates.class);
    private static final String SOLR_GET_ALL_QUERY = "*:*";
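    // Buffered deleteById calls are flushed to Solr once this many accumulate.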
    private static final int NUM_MAX_DELETE_REQUEST = 1000;

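    // Minimal Writable wrapper around a Solr document; this variant keeps only
    // the document id (the boost and timestamp fields below are commented out).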
    public static class SolrRecord implements Writable {

        //        private float boost;
        //        private long tstamp;
        private String id;

        public SolrRecord() {
        }

        public SolrRecord(RaovatPostDeleteDuplicates.SolrRecord old) {
            this.id = old.id;
            //            this.boost = old.boost;
            //            this.tstamp = old.tstamp;
        }

        //        public SolrRecord(String id, float boost, long tstamp) {
        //            this.id = id;
        //            this.boost = boost;
        //            this.tstamp = tstamp;
        //        }
        public String getId() {
            return id;
        }
        //
        //        public float getBoost() {
        //            return boost;
        //        }
        //
        //        public long getTstamp() {
        //            return tstamp;
        //        }

        public void readSolrDocument(SolrDocument doc) {
            id = (String) doc.getFieldValue(ReSolrConstants.ID_FIELD);
            //            boost = Float.parseFloat(doc.getFieldValue(ReSolrConstants.BOOST_FIELD).toString());

            //            Date buffer = (Date) doc.getFieldValue(ReSolrConstants.TIMESTAMP_FIELD);
            //            tstamp = buffer.getTime();
        }

        public void readFields(DataInput in) throws IOException {
            id = Text.readString(in);
            //            boost = in.readFloat();
            //            tstamp = in.readLong();
        }

        public void write(DataOutput out) throws IOException {
            Text.writeString(out, id);
            //            out.writeFloat(boost);
            //            out.writeLong(tstamp);
        }
    }

    public static class SolrInputSplit implements InputSplit {

        private int docBegin;
        private int numDocs;

        public SolrInputSplit() {
        }

        public SolrInputSplit(int docBegin, int numDocs) {
            this.docBegin = docBegin;
            this.numDocs = numDocs;
        }

        public int getDocBegin() {
            return docBegin;
        }

        public int getNumDocs() {
            return numDocs;
        }

        public long getLength() throws IOException {
            return numDocs;
        }

        public String[] getLocations() throws IOException {
            return new String[] {};
        }

        public void readFields(DataInput in) throws IOException {
            docBegin = in.readInt();
            numDocs = in.readInt();
        }

        public void write(DataOutput out) throws IOException {
            out.writeInt(docBegin);
            out.writeInt(numDocs);
        }
    }

    public static class SolrInputFormat implements InputFormat<Text, RaovatPostDeleteDuplicates.SolrRecord> {

        /**
         * Partition the index's document range [0, N) into numSplits
         * contiguous splits; the last split absorbs any remainder.
         */
        public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

            final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
            solrQuery.setFields(ReSolrConstants.ID_FIELD);
            solrQuery.setRows(1);

            QueryResponse response;
            try {
                response = solr.query(solrQuery);
            } catch (final SolrServerException e) {
                throw new IOException(e);
            }

            int numResults = (int) response.getResults().getNumFound();
            int numDocsPerSplit = (numResults / numSplits);
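            // Example (illustrative): with numResults = 10 and numSplits = 3,
            // numDocsPerSplit = 3, so the first two splits cover documents
            // [0, 3) and [3, 6), and the last split below absorbs the
            // remainder, covering [6, 10).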
            int currentDoc = 0;
            RaovatPostDeleteDuplicates.SolrInputSplit[] splits = new RaovatPostDeleteDuplicates.SolrInputSplit[numSplits];
            for (int i = 0; i < numSplits - 1; i++) {
                splits[i] = new RaovatPostDeleteDuplicates.SolrInputSplit(currentDoc, numDocsPerSplit);
                currentDoc += numDocsPerSplit;
            }
            splits[splits.length - 1] = new RaovatPostDeleteDuplicates.SolrInputSplit(currentDoc,
                    numResults - currentDoc);

            return splits;
        }

        public RecordReader<Text, RaovatPostDeleteDuplicates.SolrRecord> getRecordReader(final InputSplit split,
                final JobConf job, Reporter reporter) throws IOException {

            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
            RaovatPostDeleteDuplicates.SolrInputSplit solrSplit = (RaovatPostDeleteDuplicates.SolrInputSplit) split;
            final int numDocs = solrSplit.getNumDocs();

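            // Fetch exactly this split's slice of the index in one query:
            // start at the split's first document and request numDocs rows.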
            SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
            solrQuery.setFields(ReSolrConstants.ID_FIELD, ReSolrConstants.BOOST_FIELD,
                    ReSolrConstants.TIMESTAMP_FIELD, ReSolrConstants.DIGEST_FIELD);
            solrQuery.setStart(solrSplit.getDocBegin());
            solrQuery.setRows(numDocs);

            QueryResponse response;
            try {
                response = solr.query(solrQuery);
            } catch (final SolrServerException e) {
                throw new IOException(e);
            }

            final SolrDocumentList solrDocs = response.getResults();

            return new RecordReader<Text, RaovatPostDeleteDuplicates.SolrRecord>() {
                private int currentDoc = 0;

                public void close() throws IOException {
                }

                public Text createKey() {
                    return new Text();
                }

                public RaovatPostDeleteDuplicates.SolrRecord createValue() {
                    return new RaovatPostDeleteDuplicates.SolrRecord();
                }

                public long getPos() throws IOException {
                    return currentDoc;
                }

                public float getProgress() throws IOException {
                    return currentDoc / (float) numDocs;
                }

                public boolean next(Text key, RaovatPostDeleteDuplicates.SolrRecord value) throws IOException {
                    if (currentDoc >= numDocs) {
                        return false;
                    }

                    SolrDocument doc = solrDocs.get(currentDoc);
                    // Key on the digest field; it must be one of the fields
                    // requested in the query above, or its value comes back
                    // null (SIGNATURE_FIELD was read here but never requested).
                    String digest = (String) doc.getFieldValue(ReSolrConstants.DIGEST_FIELD);
                    key.set(digest);
                    value.readSolrDocument(doc);

                    currentDoc++;
                    return true;
                }
            };
        }
    }

    private Configuration conf;
    private SolrServer solr;
    private boolean noCommit = false;
    private int numDeletes = 0;
    private UpdateRequest updateRequest = new UpdateRequest();

    public Configuration getConf() {
        return conf;
    }

    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    public void configure(JobConf job) {
        try {
            solr = SolrUtils.getCommonsHttpSolrServer(job);
            noCommit = job.getBoolean("noCommit", false);
        } catch (MalformedURLException e) {
            throw new RuntimeException(e);
        }
    }

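    // Called once when the reduce task finishes: flushes any remaining
    // buffered deletes and, unless "noCommit" is set, commits the index.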
    public void close() throws IOException {
        try {
            if (numDeletes > 0) {
                LOG.info("RaovatPostDeleteDuplicates: deleting " + numDeletes + " duplicates");
                updateRequest.process(solr);

                if (!noCommit) {
                    solr.commit();
                }
            }
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

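    // For each group of records sharing a digest, keep the first record and
    // buffer deletes for the rest; batches are flushed to Solr once
    // NUM_MAX_DELETE_REQUEST deletes accumulate.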
    public void reduce(Text key, Iterator<RaovatPostDeleteDuplicates.SolrRecord> values,
            OutputCollector<Text, RaovatPostDeleteDuplicates.SolrRecord> output, Reporter reporter)
            throws IOException {
        // Keep the first record
        RaovatPostDeleteDuplicates.SolrRecord recordToKeep = new RaovatPostDeleteDuplicates.SolrRecord(
                values.next());
        // Delete all subsequent records
        while (values.hasNext()) {
            RaovatPostDeleteDuplicates.SolrRecord solrRecord = values.next();
            updateRequest.deleteById(solrRecord.id);
            numDeletes++;
            //            reporter.incrCounter("RaovatPostDeleteDuplicates", "Deleted documents", 1);
            if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
                try {
                    LOG.info("RaovatPostDeleteDuplicates: deleting " + numDeletes + " duplicates");
                    updateRequest.process(solr);
                } catch (SolrServerException e) {
                    throw new IOException(e);
                }
                updateRequest = new UpdateRequest();
                numDeletes = 0;
            }
        }
    }

    public void dedup(String solrUrl) throws IOException {
        dedup(solrUrl, false);
    }

    public void dedup(String solrUrl, boolean noCommit) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("RaovatPostDeleteDuplicates: starting at " + sdf.format(start));
        LOG.info("RaovatPostDeleteDuplicates: Solr url: " + solrUrl);

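        // Configure the legacy mapred job: identity map keyed by digest, this
        // class as the reducer, and no file output (deletes are issued to
        // Solr directly from the reducer).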
        JobConf job = new NutchJob(getConf());

        job.set(ReSolrConstants.SERVER_URL, solrUrl);
        job.setBoolean("noCommit", noCommit);
        job.setInputFormat(RaovatPostDeleteDuplicates.SolrInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(RaovatPostDeleteDuplicates.SolrRecord.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(RaovatPostDeleteDuplicates.class);

        JobClient.runJob(job);

        long end = System.currentTimeMillis();
        LOG.info("RaovatPostDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    }

    public int run(String[] args) throws IOException {
        if (args.length < 1) {
            System.err.println("Usage: RaovatPostDeleteDuplicates <solr url> [-noCommit]");
            return 1;
        }

        boolean noCommit = false;
        if (args.length == 2 && args[1].equals("-noCommit")) {
            noCommit = true;
        }

        dedup(args[0], noCommit);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        int result = ToolRunner.run(NutchConfiguration.create(), new RaovatPostDeleteDuplicates(), args);
        System.exit(result);
    }
}
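
For reference, here is a minimal sketch of driving the tool programmatically, mirroring the main() method above. The Solr URL and the driver class name are assumptions for illustration; substitute your own server address:

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;

import net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates;

public class DedupDriver {
    public static void main(String[] args) throws Exception {
        // http://localhost:8983/solr is an assumed address; point this at your
        // own Solr server. "-noCommit" buffers the deletes without committing.
        String[] toolArgs = { "http://localhost:8983/solr", "-noCommit" };
        int exitCode = ToolRunner.run(NutchConfiguration.create(),
                new RaovatPostDeleteDuplicates(), toolArgs);
        System.exit(exitCode);
    }
}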