net.peacesoft.nutch.crawl.RaovatPostSignature.java Source code

Java tutorial

Introduction

Here is the source code for net.peacesoft.nutch.crawl.RaovatPostSignature.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.peacesoft.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.logging.Level;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.indexer.solr.SolrUtils;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;

/**
 * Utility class for deleting duplicate documents from a solr index.
 *
 * The algorithm is as follows:
 *
 * Preparation: <ol> <li>Query the solr server for the number of documents (say,
 * N)</li> <li>Partition N among M map tasks. For example, if we have two map
 * tasks the first map task will deal with solr documents from 0 - (N / 2 - 1)
 * and the second will deal with documents from (N / 2) to (N - 1).</li> </ol>
 *
 * MapReduce: <ul> <li>Map: Identity map where keys are digests and values are
 * {@link SolrRecord} instances (which contain id, boost and timestamp)</li>
 * <li>Reduce: After map, {@link SolrRecord}s with the same digest will be
 * grouped together. Now, of these documents with the same digests, delete all
 * of them except the one with the highest score (boost field). If two (or more)
 * documents have the same score, then the document with the latest timestamp is
 * kept. Again, every other is deleted from solr index. </li> </ul>
 *
 * Note that unlike {@link DeleteDuplicate}s we assume that two documents in a
 * solr index will never have the same URL. So this class only deals with
 * documents with <b>different</b> URLs but the same digest.
 *
 * NOTE(review): the description above appears to be carried over from a
 * duplicate-deletion tool and does not match the code in this file. Here the
 * record reader keys documents by the "myid" field, {@link SolrRecord} stores
 * only an id and the Solr document, and both the reducer and the
 * {@code dedup} methods add documents to an update request rather than
 * deleting anything. Confirm the intended behaviour and rewrite this comment.
 */
public class RaovatPostSignature
        implements Reducer<Text, RaovatPostSignature.SolrRecord, Text, RaovatPostSignature.SolrRecord>, Tool {

    /** Logger for this class. */
    public static final Logger LOG = LoggerFactory.getLogger(RaovatPostSignature.class);
    /** Solr query string used whenever every document in the index is requested. */
    private static final String SOLR_GET_ALL_QUERY = "*:*";
    /** Value of {@code numDeletes} at which {@code reduce} sends the pending update request. */
    private static final int NUM_MAX_DELETE_REQUEST = 1000;

    /**
     * Map/reduce value that wraps one Solr document together with its id.
     *
     * <p>Only the id takes part in Hadoop serialization: {@link #write(DataOutput)}
     * emits the id and {@link #readFields(DataInput)} restores it. The wrapped
     * {@link SolrDocument} is held in memory only and is not carried across
     * serialization.
     */
    public static class SolrRecord implements Writable {

        /** Value of the id field of the wrapped document. */
        private String id;
        /** The wrapped document; not serialized. */
        private SolrDocument doc;

        /** Creates an empty record whose fields are filled in later. */
        public SolrRecord() {
        }

        /**
         * Copy constructor. The document reference is shared with {@code old},
         * not cloned.
         *
         * @param old record to copy the id and document reference from
         */
        public SolrRecord(RaovatPostSignature.SolrRecord old) {
            this.doc = old.getDocument();
            this.id = old.id;
        }

        /**
         * Creates a record that carries only an id.
         *
         * @param id     document id
         * @param boost  accepted but not stored
         * @param tstamp accepted but not stored
         */
        public SolrRecord(String id, float boost, long tstamp) {
            this.id = id;
        }

        /** @return the stored document id, possibly {@code null} if never set */
        public String getId() {
            return this.id;
        }

        /** @return the wrapped document, or {@code null} if none was attached */
        public SolrDocument getDocument() {
            return this.doc;
        }

        /**
         * Populates this record from a Solr document: reads the id from the
         * {@code ReSolrConstants.ID_FIELD} field and keeps a reference to the
         * document itself.
         *
         * @param doc_ document to wrap
         */
        public void readSolrDocument(SolrDocument doc_) {
            final Object rawId = doc_.getFieldValue(ReSolrConstants.ID_FIELD);
            this.id = (String) rawId;
            this.doc = doc_;
        }

        /** Restores the id from its serialized form; the document is left untouched. */
        public void readFields(DataInput in) throws IOException {
            this.id = Text.readString(in);
        }

        /** Serializes the id only. */
        public void write(DataOutput out) throws IOException {
            Text.writeString(out, this.id);
        }
    }

    /**
     * Input split describing a contiguous range of Solr result offsets:
     * {@code numDocs} documents starting at offset {@code docBegin}.
     */
    public static class SolrInputSplit implements InputSplit {

        /** Offset of the first document of this split. */
        private int docBegin;
        /** Number of documents assigned to this split. */
        private int numDocs;

        /** No-argument constructor; fields are filled in by {@link #readFields(DataInput)}. */
        public SolrInputSplit() {
        }

        /**
         * @param docBegin offset of the first document
         * @param numDocs  number of documents in the split
         */
        public SolrInputSplit(int docBegin, int numDocs) {
            this.numDocs = numDocs;
            this.docBegin = docBegin;
        }

        /** @return offset of the first document of this split */
        public int getDocBegin() {
            return this.docBegin;
        }

        /** @return number of documents in this split */
        public int getNumDocs() {
            return this.numDocs;
        }

        /** @return the split length, expressed as its document count */
        public long getLength() throws IOException {
            final long length = this.numDocs;
            return length;
        }

        /** @return an empty array; no host locations are reported for this split */
        public String[] getLocations() throws IOException {
            return new String[0];
        }

        /** Reads the two ints written by {@link #write(DataOutput)}, in the same order. */
        public void readFields(DataInput in) throws IOException {
            this.docBegin = in.readInt();
            this.numDocs = in.readInt();
        }

        /** Writes {@code docBegin} followed by {@code numDocs}. */
        public void write(DataOutput out) throws IOException {
            out.writeInt(this.docBegin);
            out.writeInt(this.numDocs);
        }
    }

    /**
     * Input format that reads records directly from a Solr index. Each split
     * covers a contiguous range of result offsets of the match-all query, and
     * each record is keyed by the document's "myid" field value.
     */
    public static class SolrInputFormat implements InputFormat<Text, RaovatPostSignature.SolrRecord> {

        /**
         * Partitions the index into contiguous offset ranges.
         *
         * <p>The total document count is obtained with a one-row match-all
         * query. All splits but the last receive {@code total / numSplits}
         * documents; the last split receives the remainder.
         *
         * @param job       job configuration used to locate the Solr server
         * @param numSplits requested number of splits; values below 1 are
         *                  treated as 1
         * @return the splits, never empty
         * @throws IOException if the Solr query fails
         */
        public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

            final SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
            solrQuery.setFields(ReSolrConstants.ID_FIELD);
            solrQuery.setRows(1);

            QueryResponse response;
            try {
                response = solr.query(solrQuery);
            } catch (final SolrServerException e) {
                throw new IOException(e);
            }

            // A non-positive request would otherwise divide by zero or index
            // splits[-1]; fall back to a single split covering everything.
            final int splitCount = Math.max(1, numSplits);
            // Offsets are ints, so clamp instead of letting the cast wrap around.
            final int numResults = (int) Math.min((long) Integer.MAX_VALUE,
                    response.getResults().getNumFound());
            final int numDocsPerSplit = numResults / splitCount;

            int currentDoc = 0;
            RaovatPostSignature.SolrInputSplit[] splits = new RaovatPostSignature.SolrInputSplit[splitCount];
            for (int i = 0; i < splitCount - 1; i++) {
                splits[i] = new RaovatPostSignature.SolrInputSplit(currentDoc, numDocsPerSplit);
                currentDoc += numDocsPerSplit;
            }
            // The last split absorbs whatever the integer division left over.
            splits[splits.length - 1] = new RaovatPostSignature.SolrInputSplit(currentDoc, numResults - currentDoc);

            return splits;
        }

        /**
         * Fetches all documents of the given split with a single query and
         * returns a reader that iterates over them in memory.
         *
         * @param split    must be a {@link SolrInputSplit}
         * @param job      job configuration used to locate the Solr server
         * @param reporter unused
         * @return a reader producing ("myid" value, record) pairs
         * @throws IOException if the Solr query fails
         */
        public RecordReader<Text, RaovatPostSignature.SolrRecord> getRecordReader(final InputSplit split,
                final JobConf job, Reporter reporter) throws IOException {

            SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);
            RaovatPostSignature.SolrInputSplit solrSplit = (RaovatPostSignature.SolrInputSplit) split;

            SolrQuery solrQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
            solrQuery.setStart(solrSplit.getDocBegin());
            solrQuery.setRows(solrSplit.getNumDocs());

            QueryResponse response;
            try {
                response = solr.query(solrQuery);
            } catch (final SolrServerException e) {
                throw new IOException(e);
            }

            final SolrDocumentList solrDocs = response.getResults();
            // Solr may return fewer documents than the split was planned for
            // (the index can change between getSplits and this call); never
            // read past what was actually returned.
            final int numDocs = Math.min(solrSplit.getNumDocs(), solrDocs.size());

            return new RecordReader<Text, RaovatPostSignature.SolrRecord>() {
                /** Index of the next document to hand out. */
                private int currentDoc = 0;

                public void close() throws IOException {
                }

                public Text createKey() {
                    return new Text();
                }

                public RaovatPostSignature.SolrRecord createValue() {
                    return new RaovatPostSignature.SolrRecord();
                }

                public long getPos() throws IOException {
                    return currentDoc;
                }

                public float getProgress() throws IOException {
                    // An empty split is complete by definition; avoids 0/0 = NaN.
                    if (numDocs == 0) {
                        return 1.0f;
                    }
                    return currentDoc / (float) numDocs;
                }

                public boolean next(Text key, RaovatPostSignature.SolrRecord value) throws IOException {
                    if (currentDoc >= numDocs) {
                        return false;
                    }
                    SolrDocument doc = solrDocs.get(currentDoc);

                    // NOTE(review): a document without a "myid" value yields a
                    // null here and key.set will fail - confirm every indexed
                    // document carries this field.
                    String digest = (String) doc.getFieldValue("myid");
                    key.set(digest);
                    value.readSolrDocument(doc);

                    currentDoc++;
                    return true;
                }
            };
        }
    }

    /** Configuration supplied through {@code setConf}. */
    private Configuration conf;
    /** Solr client; assigned in {@code configure} and in {@code dedup}. */
    private SolrServer solr;
    /** When true, {@code close} does not issue a Solr commit. */
    private boolean noCommit = false;
    /** Counter compared against {@code NUM_MAX_DELETE_REQUEST} in {@code reduce} and checked in {@code close}. */
    private int numDeletes = 0;
    /** Update request that accumulates the documents to be sent to Solr. */
    private UpdateRequest updateRequest = new UpdateRequest();

    /**
     * @return the configuration last passed to {@code setConf}, or {@code null}
     *         if none has been set
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Stores the configuration for later retrieval through {@code getConf}.
     *
     * @param conf configuration to keep
     */
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    /**
     * Prepares this reducer for a job: creates the Solr client from the job
     * configuration and reads the {@code "noCommit"} flag (default false).
     *
     * @param job job configuration
     * @throws RuntimeException wrapping a {@link MalformedURLException} if the
     *                          Solr client cannot be created
     */
    public void configure(JobConf job) {
        final SolrServer server;
        try {
            server = SolrUtils.getCommonsHttpSolrServer(job);
        } catch (MalformedURLException badUrl) {
            throw new RuntimeException(badUrl);
        }
        this.solr = server;
        this.noCommit = job.getBoolean("noCommit", false);
    }

    /**
     * Finishes the reducer: sends the pending update request when the counter
     * is positive, then commits unless {@code noCommit} is set.
     *
     * @throws IOException if Solr rejects the request or the commit; a
     *                     {@link SolrServerException} is wrapped
     */
    public void close() throws IOException {
        final boolean hasPending = numDeletes > 0;
        final boolean commitWanted = !noCommit;
        try {
            if (hasPending) {
                LOG.info("RaovatPostSignature: signed " + numDeletes + " key");
                updateRequest.process(solr);
            }
            if (commitWanted) {
                solr.commit();
            }
        } catch (SolrServerException solrFailure) {
            throw new IOException(solrFailure);
        }
    }

    /**
     * For every value grouped under {@code key}, looks up the Solr documents
     * whose {@code id} field equals the key and queues them on the pending
     * update request. The request is sent to Solr once
     * {@code NUM_MAX_DELETE_REQUEST} documents have been queued; anything left
     * over is sent by {@code close}.
     *
     * <p>Lookup failures are logged and skipped so that one bad key does not
     * abort the task; failures while sending a batch are propagated.
     *
     * @param key      id to look up
     * @param values   records grouped under the key; only their count matters
     * @param output   unused, nothing is emitted
     * @param reporter unused
     * @throws IOException if sending a full batch to Solr fails
     */
    public void reduce(Text key, Iterator<RaovatPostSignature.SolrRecord> values,
            OutputCollector<Text, RaovatPostSignature.SolrRecord> output, Reporter reporter) throws IOException {
        // Escape the key so that ids containing query syntax characters
        // (':', '/', '-', whitespace, ...) are matched literally instead of
        // breaking or altering the query.
        final String idQuery = "id:" + ClientUtils.escapeQueryChars(String.valueOf(key));

        while (values.hasNext()) {
            // The record content is not needed; just advance the iterator.
            values.next();

            try {
                final QueryResponse response = solr.query(new SolrQuery(idQuery));
                final SolrDocumentList solrDocs = response.getResults();
                for (SolrDocument solrDocument : solrDocs) {
                    LOG.info(solrDocument.toString());
                    updateRequest.add(ClientUtils.toSolrInputDocument(solrDocument));
                    // Track how much is queued; without this the batch below
                    // and the final flush in close() would never fire.
                    numDeletes++;
                }
                LOG.info("RaovatPostSignature sign the key to signature field id " + key + " value "
                        + solrDocs.size());
            } catch (Exception ex) {
                // Best effort: a failed lookup for one key must not fail the task.
                LOG.error("RaovatPostSignature reduce error: " + ex.toString(), ex);
            }

            if (numDeletes >= NUM_MAX_DELETE_REQUEST) {
                try {
                    LOG.info("RaovatPostSignature: signed " + numDeletes + " key");
                    updateRequest.process(solr);
                } catch (SolrServerException e) {
                    throw new IOException(e);
                }
                updateRequest = new UpdateRequest();
                numDeletes = 0;
            }
        }
    }

    /**
     * Runs {@link #dedup(String, boolean)} with a commit at the end.
     *
     * @param solrUrl URL of the Solr server
     * @throws IOException if any Solr request fails
     */
    public void dedup(String solrUrl) throws IOException {
        dedup(solrUrl, false);
    }

    /**
     * Pages through every document of the index with the match-all query and
     * sends each page back to Solr as an update request, then commits unless
     * told otherwise.
     *
     * <p>This runs in-process; the MapReduce path built from
     * {@link SolrInputFormat} and the reducer is not wired in here.
     *
     * @param solrUrl  URL of the Solr server
     * @param noCommit when true, no commit is issued after the updates
     * @throws IOException if a query, an update or the commit fails; a
     *                     {@link SolrServerException} is wrapped
     */
    public void dedup(String solrUrl, boolean noCommit) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("RaovatPostSignature: starting at " + sdf.format(start));
        LOG.info("RaovatPostSignature: Solr url: " + solrUrl);

        JobConf job = new NutchJob(getConf());
        job.set(ReSolrConstants.SERVER_URL, solrUrl);

        solr = SolrUtils.getCommonsHttpSolrServer(job);

        // One-row query: only the total hit count is needed.
        SolrQuery countQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
        countQuery.setFields(ReSolrConstants.ID_FIELD);
        countQuery.setRows(1);

        final long numResults = querySolr(countQuery).getResults().getNumFound();
        LOG.info("Total items:" + numResults);

        final int pageSize = 200;
        // Solr offsets are zero-based, so paging starts at 0. The offset is
        // tracked as a long so the increment cannot overflow.
        // NOTE(review): no sort is specified and nothing is committed until
        // the end, so stable paging relies on the server not auto-committing
        // mid-run - confirm against the Solr configuration.
        for (long offset = 0; offset < numResults && offset <= Integer.MAX_VALUE; offset += pageSize) {
            SolrQuery pageQuery = new SolrQuery(SOLR_GET_ALL_QUERY);
            pageQuery.setStart((int) offset);
            pageQuery.setRows(pageSize);

            SolrDocumentList solrDocs = querySolr(pageQuery).getResults();
            if (solrDocs.isEmpty()) {
                continue;
            }

            // Send each page as its own request so memory stays bounded and
            // the documents actually reach the server.
            UpdateRequest pageRequest = new UpdateRequest();
            for (SolrDocument solrDocument : solrDocs) {
                pageRequest.add(ClientUtils.toSolrInputDocument(solrDocument));
            }
            try {
                pageRequest.process(solr);
            } catch (SolrServerException e) {
                throw new IOException(e);
            }
        }

        if (!noCommit) {
            try {
                solr.commit();
            } catch (SolrServerException ex) {
                throw new IOException(ex);
            }
        }

        long end = System.currentTimeMillis();
        LOG.info("RaovatPostSignature: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    }

    /** Executes a query on the current Solr client, wrapping Solr failures in an IOException. */
    private QueryResponse querySolr(SolrQuery query) throws IOException {
        try {
            return solr.query(query);
        } catch (final SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * Command-line entry point used by the tool runner.
     *
     * @param args {@code <solr url> [-noCommit]}; the flag is recognised only
     *             as the second of exactly two arguments
     * @return 1 when no arguments are given (usage is printed), 0 otherwise
     * @throws IOException if the underlying run fails
     */
    public int run(String[] args) throws IOException {
        final boolean missingUrl = args.length < 1;
        if (missingUrl) {
            System.err.println("Usage: RaovatPostSignature <solr url> [-noCommit]");
            return 1;
        }

        final boolean skipCommit = args.length == 2 && args[1].equals("-noCommit");
        final String solrUrl = args[0];

        dedup(solrUrl, skipCommit);
        return 0;
    }

    /**
     * Launches the tool through {@link ToolRunner} with a fresh Nutch
     * configuration and exits the JVM with the tool's return code.
     *
     * @param args command-line arguments, passed through to {@code run}
     * @throws Exception whatever the tool runner propagates
     */
    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new RaovatPostSignature(), args));
    }
}