org.warcbase.index.IndexerReducer.java Source code

Introduction

Here is the source code for org.warcbase.index.IndexerReducer.java. This class is the Hadoop reducer in Warcbase's Solr indexing pipeline: for each shard key it starts an embedded Solr server whose index lives on HDFS, submits the incoming documents in batches, and commits and shuts the server down once the shard is complete.

Source

/*
 * Warcbase: an open-source platform for managing web archives
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.warcbase.index;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.HdfsDirectoryFactory;

import uk.bl.wa.apache.solr.hadoop.Solate;
import uk.bl.wa.hadoop.indexer.WritableSolrRecord;
import uk.bl.wa.solr.SolrRecord;

public class IndexerReducer extends MapReduceBase implements Reducer<IntWritable, WritableSolrRecord, Text, Text> {
    public static final String HDFS_OUTPUT_PATH = "IndexerReducer.HDFSOutputPath";

    private static final Log LOG = LogFactory.getLog(IndexerReducer.class);
    private static final int SUBMISSION_PAUSE_MINS = 5;
    private static final String SHARD_PREFIX = "shard";

    private SolrServer solrServer;
    private int batchSize = 1000;
    private List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();
    private int numberOfSequentialFails = 0;

    private FileSystem fs;
    private Path solrHome;
    private Path outputDir;

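    // Counters reported back to the Hadoop framework: records submitted, submission errors,
    // and records dropped after repeated submission failures.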
    static enum MyCounters {
        NUM_RECORDS, NUM_ERRORS, NUM_DROPPED_RECORDS
    }

    @Override
    public void configure(JobConf job) {
        LOG.info("Configuring reducer...");

        // Initialize the embedded server.
        try {
            job.setBoolean("fs.hdfs.impl.disable.cache", true);
            fs = FileSystem.get(job);
            solrHome = Solate.findSolrConfig(job, IndexerRunner.solrHomeZipName);
            LOG.info("Found solrHomeDir " + solrHome);
        } catch (IOException e) {
            e.printStackTrace();
            LOG.error("FAILED in reducer configuration: " + e);
        }
        outputDir = new Path(job.get(HDFS_OUTPUT_PATH));
        LOG.info("HDFS index output path: " + outputDir);

        LOG.info("Initialization complete.");
    }

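    /**
     * Starts an embedded Solr server for the given shard, pointing its data directory at
     * HDFS under the job's output path so the index is written directly to HDFS.
     */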
    private void initEmbeddedServer(int slice) throws IOException {
        if (solrHome == null) {
            throw new IOException("Unable to find solr home setting");
        }

        Path outputShardDir = new Path(fs.getHomeDirectory() + "/" + outputDir, SHARD_PREFIX + slice);

        LOG.info("Creating embedded Solr server with solrHomeDir: " + solrHome + ", fs: " + fs
                + ", outputShardDir: " + outputShardDir);

        Path solrDataDir = new Path(outputShardDir, "data");
        if (!fs.exists(solrDataDir) && !fs.mkdirs(solrDataDir)) {
            throw new IOException("Unable to create " + solrDataDir);
        }

        String dataDirStr = solrDataDir.toUri().toString();
        LOG.info("Attempting to set data dir to: " + dataDirStr);

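        // Configure the embedded core via system properties: home and data directories, plus an
        // HdfsDirectoryFactory with HDFS locking so the Lucene index goes straight to HDFS.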
        System.setProperty("solr.data.dir", dataDirStr);
        System.setProperty("solr.home", solrHome.toString());
        System.setProperty("solr.solr.home", solrHome.toString());
        System.setProperty("solr.hdfs.home", outputDir.toString());
        System.setProperty("solr.directoryFactory", HdfsDirectoryFactory.class.getName());
        System.setProperty("solr.lock.type", "hdfs");
        System.setProperty("solr.hdfs.nrtcachingdirectory", "false");
        System.setProperty("solr.hdfs.blockcache.enabled", "true");
        System.setProperty("solr.hdfs.blockcache.write.enabled", "false");
        System.setProperty("solr.autoCommit.maxTime", "600000");
        System.setProperty("solr.autoSoftCommit.maxTime", "-1");

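        // CoreContainer picks up the solr.solr.home property set above and loads the core(s) defined there.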
        LOG.info("Loading the container...");
        CoreContainer container = new CoreContainer();
        container.load();
        for (String s : container.getAllCoreNames()) {
            LOG.warn("Got core name: " + s);
        }
        String coreName = "";
        if (container.getCoreNames().size() > 0) {
            coreName = container.getCoreNames().iterator().next();
        }

        LOG.error("Now firing up the server...");
        solrServer = new EmbeddedSolrServer(container, coreName);
        LOG.error("Server started successfully!");
    }

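    /**
     * Each key identifies one target shard; all records for that shard are indexed into a
     * dedicated embedded Solr server, which is then committed and shut down.
     */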
    @Override
    public void reduce(IntWritable key, Iterator<WritableSolrRecord> values, OutputCollector<Text, Text> output,
            Reporter reporter) throws IOException {
        SolrRecord solr;

        // Get the shard number, but counting from 1 instead of 0:
        int shard = key.get() + 1;

        // For indexing into HDFS, set up a new server per key:
        initEmbeddedServer(shard);

        // Go through the documents for this shard:
        long cnt = 0;
        while (values.hasNext()) {
            solr = values.next().getSolrRecord();
            cnt++;

            docs.add(solr.getSolrDocument());
            // Have we exceeded the batchSize?
            checkSubmission(docs, batchSize, reporter);

            // Occasionally update application-level status:
            if ((cnt % 1000) == 0) {
                reporter.setStatus(SHARD_PREFIX + shard + ": processed " + cnt + ", dropped "
                        + reporter.getCounter(MyCounters.NUM_DROPPED_RECORDS).getValue());
            }
        }

        try {
            // If we have at least one document unsubmitted, make sure we submit it.
            checkSubmission(docs, 1, reporter);

            // If we are indexing to HDFS, shut the shard down:
            // Commit, and block until the changes have been flushed.
            solrServer.commit(true, false);
            solrServer.shutdown();
        } catch (Exception e) {
            LOG.error("ERROR on commit: " + e);
            e.printStackTrace();
        }
    }

    @Override
    public void close() {
    }

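    /**
     * Submits the accumulated documents to Solr once the batch reaches the given limit,
     * pausing between retries and dropping the batch after repeated failures.
     */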
    private void checkSubmission(List<SolrInputDocument> docs, int limit, Reporter reporter) {
        if (docs.size() > 0 && docs.size() >= limit) {
            try {
                // Inform that there is progress (still-alive):
                reporter.progress();
                UpdateResponse response = solrServer.add(docs);
                LOG.info("Submitted " + docs.size() + " docs [" + response.getStatus() + "]");
                reporter.incrCounter(MyCounters.NUM_RECORDS, docs.size());
                docs.clear();
                numberOfSequentialFails = 0;
            } catch (Exception e) {
                // Count up repeated fails:
                numberOfSequentialFails++;

                // If there have been several consecutive failures, drop the records (we have seen
                // errors such as "Invalid UTF-8 character 0xfffe at char", so this avoids bad data blocking job completion).
                if (this.numberOfSequentialFails >= 3) {
                    LOG.error("Submission has repeatedly failed - assuming bad data and dropping these "
                            + docs.size() + " records.");
                    reporter.incrCounter(MyCounters.NUM_DROPPED_RECORDS, docs.size());
                    docs.clear();
                }

                // SOLR-5719 possibly hitting us here; CloudSolrServer.RouteException
                LOG.error("Sleeping for " + SUBMISSION_PAUSE_MINS + " minute(s): " + e.getMessage(), e);
                // Also add a report for this condition.
                reporter.incrCounter(MyCounters.NUM_ERRORS, 1);
                try {
                    Thread.sleep(1000 * 60 * SUBMISSION_PAUSE_MINS);
                } catch (InterruptedException ex) {
                    LOG.warn("Sleep between Solr submissions was interrupted!");
                }
            }
        }
    }
}
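
Usage sketch

The driver class (IndexerRunner, referenced above for solrHomeZipName) is responsible for setting IndexerReducer.HDFS_OUTPUT_PATH and wiring this reducer into a job built on the old org.apache.hadoop.mapred API. The fragment below is a minimal, hypothetical sketch of that wiring, not the actual IndexerRunner code: the job name, output path, and shard count are illustrative assumptions, and the mapper class, input format, and input paths (which are not part of this listing) are omitted.

// Hypothetical driver fragment (illustrative only, not part of the Warcbase source):
JobConf conf = new JobConf(IndexerRunner.class);
conf.setJobName("warcbase-solr-indexing");

// Where the per-shard Solr indexes will be written on HDFS (assumed path).
conf.set(IndexerReducer.HDFS_OUTPUT_PATH, "/user/warcbase/solr-index");

// The mapper (omitted here) must emit (IntWritable shard, WritableSolrRecord record) pairs.
conf.setMapOutputKeyClass(IntWritable.class);
conf.setMapOutputValueClass(WritableSolrRecord.class);

conf.setReducerClass(IndexerReducer.class);
// Typically one reduce task per shard, so each task builds a single index under shardN/.
conf.setNumReduceTasks(numShards); // numShards: assumed value chosen by the driver

JobClient.runJob(conf);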