org.apache.solr.hadoop.SolrRecordWriter.java Source code

Introduction

Here is the source code for org.apache.solr.hadoop.SolrRecordWriter.java, a Hadoop RecordWriter that builds a Solr index shard by converting reducer output into batches of SolrInputDocuments and feeding them to an embedded Solr server.
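
This writer is not instantiated directly; SolrOutputFormat hands one to each reduce task. As a hedged driver-side sketch (the setupSolrHomeCache helper and the SolrInputDocumentWritable value class are taken from the solr-map-reduce contrib and may differ between versions), a job is wired up roughly like this:

import java.io.File;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.solr.hadoop.SolrInputDocumentWritable;
import org.apache.solr.hadoop.SolrOutputFormat;

public class SolrIndexDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Batch size read by SolrRecordWriter (see the class Javadoc below)
        conf.setInt("solr.record.writer.batch.size", 1000);

        Job job = Job.getInstance(conf, "solr-index-build");
        job.setJarByClass(SolrIndexDriver.class);
        // Mapper/reducer wiring omitted; the reducer emits
        // SolrInputDocumentWritable values that SolrRecordWriter indexes.
        job.setOutputFormatClass(SolrOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(SolrInputDocumentWritable.class);

        // Ship the zipped Solr home (conf/, lib/) through the distributed
        // cache; SolrRecordWriter.findSolrConfig() locates it task-side.
        SolrOutputFormat.setupSolrHomeCache(new File("solr-home.zip"), job);
        FileOutputFormat.setOutputPath(job, new Path("hdfs:///indexes/shards"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}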

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.hadoop;

import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskID;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.CoreDescriptor;
import org.apache.solr.core.SolrCore;
import org.apache.solr.core.SolrResourceLoader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Instantiate a record writer that will build a Solr index.
 * 
 * A zip file containing the Solr config and additional libraries is expected
 * to be passed via the distributed cache. The incoming written records are
 * converted via the specified document converter and written to the index in
 * batches. When the job is done, {@link #close} copies the index to the
 * destination output file system.
 * 
 * <h2>Configuration Parameters</h2>
 * <ul>
 * <li>solr.record.writer.batch.size - the number of documents in a batch that
 * is sent to the indexer.</li>
 * <li>mapred.task.id - used to build the unique temporary index directory
 * name.</li>
 * <li>solr.output.format.setup - {@link SolrOutputFormat#SETUP_OK}, the path
 * to the configuration zip file.</li>
 * <li>{@link SolrOutputFormat#zipName} - the file name of the configuration
 * zip file.</li>
 * <li>solr.document.converter.class -
 * {@link SolrDocumentConverter#CONVERTER_NAME_KEY}, the class used to convert
 * the {@link #write} key/value pairs into a {@link SolrInputDocument}. Set via
 * {@link SolrDocumentConverter}.</li>
 * </ul>
 * A driver-side sketch of these settings appears just below the class
 * declaration.
 */
class SolrRecordWriter<K, V> extends RecordWriter<K, V> {
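
    // Parameter sketch (not part of the original source): the string key is
    // taken from the class Javadoc above; the converter setter name is an
    // assumption based on "Set via SolrDocumentConverter" in that Javadoc.
    //
    //   conf.setInt("solr.record.writer.batch.size", 1000);
    //   SolrDocumentConverter.setSolrDocumentConverter(MyConverter.class, conf);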

    private static final Logger LOG = LoggerFactory.getLogger(SolrRecordWriter.class);

    public final static List<String> allowedConfigDirectories = new ArrayList<String>(
            Arrays.asList("conf", "lib"));

    public final static Set<String> requiredConfigDirectories = new HashSet<String>();

    static {
        requiredConfigDirectories.add("conf");
    }

    /**
     * Return the list of directory names that may be included in the
     * configuration data passed to the tasks.
     * 
     * @return an unmodifiable List of directory names
     */
    public static List<String> getAllowedConfigDirectories() {
        return Collections.unmodifiableList(allowedConfigDirectories);
    }

    /**
     * Check whether the passed-in directory is required to be present in the
     * configuration data set.
     * 
     * @param directory The directory to check
     * @return true if the directory is required.
     */
    public static boolean isRequiredConfigDirectory(final String directory) {
        return requiredConfigDirectories.contains(directory);
    }

    // The final index output path and the local temporary build directory were
    // tracked by fields that have since been removed; the commented-out
    // packZipFile code below still refers to them as perm and temp.

    //  /**
    //   * If true, create a zip file of the completed index in the final storage
    //   * location A .zip will be appended to the final output name if it is not
    //   * already present.
    //   */
    //  private boolean outputZipFile = false;

    private final HeartBeater heartBeater;
    private final BatchWriter batchWriter;
    private final List<SolrInputDocument> batch;
    private final int batchSize;
    private long numDocsWritten = 0;
    private long nextLogTime = System.currentTimeMillis();

    private static final HashMap<TaskID, Reducer<?, ?, ?, ?>.Context> contextMap = new HashMap<TaskID, Reducer<?, ?, ?, ?>.Context>();

    public SolrRecordWriter(TaskAttemptContext context, Path outputShardDir, int batchSize) {
        this.batchSize = batchSize;
        this.batch = new ArrayList<SolrInputDocument>(batchSize);
        Configuration conf = context.getConfiguration();

        // setLogLevel("org.apache.solr.core", "WARN");
        // setLogLevel("org.apache.solr.update", "WARN");

        heartBeater = new HeartBeater(context);
        try {
            heartBeater.needHeartBeat();

            Path solrHomeDir = SolrRecordWriter.findSolrConfig(conf);
            FileSystem fs = outputShardDir.getFileSystem(conf);
            EmbeddedSolrServer solr = createEmbeddedSolrServer(solrHomeDir, fs, outputShardDir);
            batchWriter = new BatchWriter(solr, batchSize, context.getTaskAttemptID().getTaskID(),
                    SolrOutputFormat.getSolrWriterThreadCount(conf), SolrOutputFormat.getSolrWriterQueueSize(conf));

        } catch (Exception e) {
            throw new IllegalStateException(String.format("Failed to initialize record writer for %s, %s",
                    context.getJobName(), conf.get("mapred.task.id")), e);
        } finally {
            heartBeater.cancelHeartBeat();
        }
    }

    public static EmbeddedSolrServer createEmbeddedSolrServer(Path solrHomeDir, FileSystem fs, Path outputShardDir)
            throws IOException {

        if (solrHomeDir == null) {
            throw new IOException("Unable to find solr home setting");
        }
        LOG.info("Creating embedded Solr server with solrHomeDir: " + solrHomeDir + ", fs: " + fs
                + ", outputShardDir: " + outputShardDir);

        Properties props = new Properties();
        // FIXME note this is odd (no scheme) given Solr doesn't currently
        // support uris (just abs/relative path)
        Path solrDataDir = new Path(outputShardDir, "data");
        if (!fs.exists(solrDataDir) && !fs.mkdirs(solrDataDir)) {
            throw new IOException("Unable to create " + solrDataDir);
        }

        String dataDirStr = solrDataDir.toUri().toString();
        props.setProperty("solr.data.dir", dataDirStr);
        props.setProperty("solr.home", solrHomeDir.toString());

        SolrResourceLoader loader = new SolrResourceLoader(solrHomeDir.toString(), null, props);

        LOG.info(String.format(
                "Constructed instance information solr.home %s (%s), instance dir %s, conf dir %s, writing index to solr.data.dir %s, with permdir %s",
                solrHomeDir, solrHomeDir.toUri(), loader.getInstanceDir(), loader.getConfigDir(), dataDirStr,
                outputShardDir));

        CoreContainer container = new CoreContainer(loader);
        container.load();
        CoreDescriptor descr = new CoreDescriptor(container, "core1", solrHomeDir.toString());

        descr.setDataDir(dataDirStr);
        descr.setCoreProperties(props);
        SolrCore core = container.create(descr);
        container.register(core, false);

        System.setProperty("solr.hdfs.nrtcachingdirectory", "false");
        System.setProperty("solr.hdfs.blockcache.enabled", "false");
        System.setProperty("solr.autoCommit.maxTime", "-1");
        System.setProperty("solr.autoSoftCommit.maxTime", "-1");
        EmbeddedSolrServer solr = new EmbeddedSolrServer(container, "core1");
        return solr;
    }
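
    // Hedged usage sketch: the method above is public and static, so it can be
    // exercised outside a MapReduce task, e.g. from a test against the local
    // file system (the paths are hypothetical):
    //
    //   Configuration conf = new Configuration();
    //   FileSystem fs = FileSystem.getLocal(conf);
    //   Path solrHome = new Path("/tmp/solr-home");   // unpacked config zip
    //   Path shard = new Path("/tmp/shard-00000");
    //   EmbeddedSolrServer solr = createEmbeddedSolrServer(solrHome, fs, shard);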

    public static void incrementCounter(TaskID taskId, String groupName, String counterName, long incr) {
        Reducer<?, ?, ?, ?>.Context context = contextMap.get(taskId);
        if (context != null) {
            context.getCounter(groupName, counterName).increment(incr);
        }
    }

    public static void incrementCounter(TaskID taskId, Enum<?> counterName, long incr) {
        Reducer<?, ?, ?, ?>.Context context = contextMap.get(taskId);
        if (context != null) {
            context.getCounter(counterName).increment(incr);
        }
    }

    public static void addReducerContext(Reducer<?, ?, ?, ?>.Context context) {
        TaskID taskID = context.getTaskAttemptID().getTaskID();
        contextMap.put(taskID, context);
    }
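
    // Call-pattern sketch (an assumption, based on the static contextMap
    // above): a reducer registers its context once in setup() so BatchWriter
    // threads can report progress through counters while indexing:
    //
    //   @Override
    //   protected void setup(Context context) {
    //     SolrRecordWriter.addReducerContext(context);
    //   }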

    public static Path findSolrConfig(Configuration conf) throws IOException {
        Path solrHome = null;
        // FIXME when mrunit supports the new cache apis
        //URI[] localArchives = context.getCacheArchives();
        Path[] localArchives = DistributedCache.getLocalCacheArchives(conf);
        if (localArchives.length == 0) {
            throw new IOException(String.format("No local cache archives, where is %s:%s",
                    SolrOutputFormat.getSetupOk(), SolrOutputFormat.getZipName(conf)));
        }
        for (Path unpackedDir : localArchives) {
            // Only logged if debugging
            if (LOG.isDebugEnabled()) {
                LOG.debug(String.format("Examining unpack directory %s for %s", unpackedDir,
                        SolrOutputFormat.getZipName(conf)));

                ProcessBuilder lsCmd = new ProcessBuilder("/bin/ls", "-lR", unpackedDir.toString());
                // Must pass true: the no-argument overload is a getter and
                // does not redirect anything.
                lsCmd.redirectErrorStream(true);
                Process ls = lsCmd.start();
                byte[] buf = new byte[16 * 1024];
                InputStream all = ls.getInputStream();
                try {
                    int count;
                    while ((count = all.read(buf)) >= 0) {
                        System.err.write(buf, 0, count);
                    }
                } catch (IOException ignore) {
                } finally {
                    all.close();
                }
                String exitValue;
                try {
                    exitValue = String.valueOf(ls.waitFor());
                } catch (InterruptedException e) {
                    exitValue = "interrupted";
                }
                System.err.format("Exit value of 'ls -lR' is %s%n", exitValue);
            }
            if (unpackedDir.getName().equals(SolrOutputFormat.getZipName(conf))) {
                LOG.info("Using this unpacked directory as solr home: {}", unpackedDir);
                solrHome = unpackedDir;
                break;
            }
        }
        return solrHome;
    }
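
    // For orientation (an assumption based on allowedConfigDirectories and
    // requiredConfigDirectories above): the unpacked cache archive returned as
    // solr home is expected to look roughly like
    //
    //   solr.zip/            <- name matches SolrOutputFormat.getZipName(conf)
    //     conf/              <- required: solrconfig.xml, schema.xml, ...
    //     lib/               <- optional plugin jars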

    /**
     * Write a record. This method accumulates records into a batch, and when
     * {@link #batchSize} items are present, flushes the batch to the indexer.
     * The writes can take a substantial amount of time, depending on
     * {@link #batchSize}. If there is heavy disk contention, the writes may
     * exceed the 600-second default task timeout.
     */
    @Override
    public void write(K key, V value) throws IOException {
        heartBeater.needHeartBeat();
        try {
            try {
                SolrInputDocumentWritable sidw = (SolrInputDocumentWritable) value;
                batch.add(sidw.getSolrInputDocument());
                if (batch.size() >= batchSize) {
                    batchWriter.queueBatch(batch);
                    numDocsWritten += batch.size();
                    if (System.currentTimeMillis() >= nextLogTime) {
                        LOG.info("docsWritten: {}", numDocsWritten);
                        nextLogTime += 10000;
                    }
                    batch.clear();
                }
            } catch (SolrServerException e) {
                throw new IOException(e);
            }
        } finally {
            heartBeater.cancelHeartBeat();
        }

    }
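
    // Reducer-side sketch (hypothetical field names): each record passed to
    // write() above contributes one document to the current batch.
    //
    //   SolrInputDocument doc = new SolrInputDocument();
    //   doc.addField("id", key.toString());
    //   context.write(key, new SolrInputDocumentWritable(doc));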

    @Override
    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
        if (context != null) {
            heartBeater.setProgress(context);
        }
        try {
            heartBeater.needHeartBeat();
            if (batch.size() > 0) {
                batchWriter.queueBatch(batch);
                numDocsWritten += batch.size();
                batch.clear();
            }
            LOG.info("docsWritten: {}", numDocsWritten);
            batchWriter.close(context);
            //      if (outputZipFile) {
            //        context.setStatus("Writing Zip");
            //        packZipFile(); // Written to the perm location
            //      } else {
            //        context.setStatus("Copying Index");
            //        fs.completeLocalOutput(perm, temp); // copy to dfs
            //      }
        } catch (Exception e) {
            if (e instanceof IOException) {
                throw (IOException) e;
            }
            throw new IOException(e);
        } finally {
            heartBeater.cancelHeartBeat();
            heartBeater.close();
            //      File tempFile = new File(temp.toString());
            //      if (tempFile.exists()) {
            //        FileUtils.forceDelete(new File(temp.toString()));
            //      }
        }

        context.setStatus("Done");
    }

    //  private void packZipFile() throws IOException {
    //    FSDataOutputStream out = null;
    //    ZipOutputStream zos = null;
    //    int zipCount = 0;
    //    LOG.info("Packing zip file for " + perm);
    //    try {
    //      out = fs.create(perm, false);
    //      zos = new ZipOutputStream(out);
    //
    //      String name = perm.getName().replaceAll(".zip$", "");
    //      LOG.info("adding index directory" + temp);
    //      zipCount = zipDirectory(conf, zos, name, temp.toString(), temp);
    //      /**
    //      for (String configDir : allowedConfigDirectories) {
    //        if (!isRequiredConfigDirectory(configDir)) {
    //          continue;
    //        }
    //        final Path confPath = new Path(solrHome, configDir);
    //        LOG.info("adding configdirectory" + confPath);
    //
    //        zipCount += zipDirectory(conf, zos, name, solrHome.toString(), confPath);
    //      }
    //      **/
    //    } catch (Throwable ohFoo) {
    //      LOG.error("packZipFile exception", ohFoo);
    //      if (ohFoo instanceof RuntimeException) {
    //        throw (RuntimeException) ohFoo;
    //      }
    //      if (ohFoo instanceof IOException) {
    //        throw (IOException) ohFoo;
    //      }
    //      throw new IOException(ohFoo);
    //
    //    } finally {
    //      if (zos != null) {
    //        if (zipCount == 0) { // If no entries were written, only close out, as
    //                             // the zip will throw an error
    //          LOG.error("No entries written to zip file " + perm);
    //          fs.delete(perm, false);
    //          // out.close();
    //        } else {
    //          LOG.info(String.format("Wrote %d items to %s for %s", zipCount, perm,
    //              temp));
    //          zos.close();
    //        }
    //      }
    //    }
    //  }
    //
    //  /**
    //   * Write a file to a zip output stream, removing leading path name components
    //   * from the actual file name when creating the zip file entry.
    //   * 
    //   * The entry placed in the zip file is <code>baseName</code>/
    //   * <code>relativePath</code>, where <code>relativePath</code> is constructed
    //   * by removing a leading <code>root</code> from the path for
    //   * <code>itemToZip</code>.
    //   * 
    //   * If <code>itemToZip</code> is an empty directory, it is ignored. If
    //   * <code>itemToZip</code> is a directory, the contents of the directory are
    //   * added recursively.
    //   * 
    //   * @param zos The zip output stream
    //   * @param baseName The base name to use for the file name entry in the zip
    //   *        file
    //   * @param root The path to remove from <code>itemToZip</code> to make a
    //   *        relative path name
    //   * @param itemToZip The path to the file to be added to the zip file
    //   * @return the number of entries added
    //   * @throws IOException
    //   */
    //  static public int zipDirectory(final Configuration conf,
    //      final ZipOutputStream zos, final String baseName, final String root,
    //      final Path itemToZip) throws IOException {
    //    LOG
    //        .info(String
    //            .format("zipDirectory: %s %s %s", baseName, root, itemToZip));
    //    LocalFileSystem localFs = FileSystem.getLocal(conf);
    //    int count = 0;
    //
    //    final FileStatus itemStatus = localFs.getFileStatus(itemToZip);
    //    if (itemStatus.isDirectory()) {
    //      final FileStatus[] statai = localFs.listStatus(itemToZip);
    //
    //      // Add a directory entry to the zip file
    //      final String zipDirName = relativePathForZipEntry(itemToZip.toUri()
    //          .getPath(), baseName, root);
    //      final ZipEntry dirZipEntry = new ZipEntry(zipDirName
    //          + Path.SEPARATOR_CHAR);
    //      LOG.info(String.format("Adding directory %s to zip", zipDirName));
    //      zos.putNextEntry(dirZipEntry);
    //      zos.closeEntry();
    //      count++;
    //
    //      if (statai == null || statai.length == 0) {
    //        LOG.info(String.format("Skipping empty directory %s", itemToZip));
    //        return count;
    //      }
    //      for (FileStatus status : statai) {
    //        count += zipDirectory(conf, zos, baseName, root, status.getPath());
    //      }
    //      LOG.info(String.format("Wrote %d entries for directory %s", count,
    //          itemToZip));
    //      return count;
    //    }
    //
    //    final String inZipPath = relativePathForZipEntry(itemToZip.toUri()
    //        .getPath(), baseName, root);
    //
    //    if (inZipPath.length() == 0) {
    //      LOG.warn(String.format("Skipping empty zip file path for %s (%s %s)",
    //          itemToZip, root, baseName));
    //      return 0;
    //    }
    //
    //    // Take empty files in case the place holder is needed
    //    FSDataInputStream in = null;
    //    try {
    //      in = localFs.open(itemToZip);
    //      final ZipEntry ze = new ZipEntry(inZipPath);
    //      ze.setTime(itemStatus.getModificationTime());
    //      // Comments confuse looking at the zip file
    //      // ze.setComment(itemToZip.toString());
    //      zos.putNextEntry(ze);
    //
    //      IOUtils.copyBytes(in, zos, conf, false);
    //      zos.closeEntry();
    //      LOG.info(String.format("Wrote %d entries for file %s", count, itemToZip));
    //      return 1;
    //    } finally {
    //      in.close();
    //    }
    //
    //  }
    //
    //  static String relativePathForZipEntry(final String rawPath,
    //      final String baseName, final String root) {
    //    String relativePath = rawPath.replaceFirst(Pattern.quote(root.toString()),
    //        "");
    //    LOG.info(String.format("RawPath %s, baseName %s, root %s, first %s",
    //        rawPath, baseName, root, relativePath));
    //
    //    if (relativePath.startsWith(Path.SEPARATOR)) {
    //      relativePath = relativePath.substring(1);
    //    }
    //    LOG.info(String.format(
    //        "RawPath %s, baseName %s, root %s, post leading slash %s", rawPath,
    //        baseName, root, relativePath));
    //    if (relativePath.isEmpty()) {
    //      LOG.warn(String.format(
    //          "No data after root (%s) removal from raw path %s", root, rawPath));
    //      return baseName;
    //    }
    //    // Construct the path that will be written to the zip file, including
    //    // removing any leading '/' characters
    //    String inZipPath = baseName + Path.SEPARATOR_CHAR + relativePath;
    //
    //    LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 1 %s",
    //        rawPath, baseName, root, inZipPath));
    //    if (inZipPath.startsWith(Path.SEPARATOR)) {
    //      inZipPath = inZipPath.substring(1);
    //    }
    //    LOG.info(String.format("RawPath %s, baseName %s, root %s, inZip 2 %s",
    //        rawPath, baseName, root, inZipPath));
    //
    //    return inZipPath;
    //
    //  }
    //  
    /*
    static boolean setLogLevel(String packageName, String level) {
      Log logger = LogFactory.getLog(packageName);
      if (logger == null) {
        return false;
      }
      // look for: org.apache.commons.logging.impl.SLF4JLocationAwareLog
      LOG.warn("logger class:"+logger.getClass().getName());
      if (logger instanceof Log4JLogger) {
        process(((Log4JLogger) logger).getLogger(), level);
        return true;
      }
      if (logger instanceof Jdk14Logger) {
        process(((Jdk14Logger) logger).getLogger(), level);
        return true;
      }
      return false;
    }
        
    public static void process(org.apache.log4j.Logger log, String level) {
      if (level != null) {
        log.setLevel(org.apache.log4j.Level.toLevel(level));
      }
    }
        
    public static void process(java.util.logging.Logger log, String level) {
      if (level != null) {
        log.setLevel(java.util.logging.Level.parse(level));
      }
    }
    */
}