com.datasalt.utils.viewbuilder.ShardedSolrDocumentConverter.java Source code

Java tutorial

Introduction

Here is the source code for com.datasalt.utils.viewbuilder.ShardedSolrDocumentConverter.java

Source

/**
 * Copyright [2011] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.datasalt.utils.viewbuilder;

import java.util.Collection;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.log4j.Logger;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.hadoop.SolrDocumentConverter;

/**
 * 
 * The aim of this {@link SolrDocumentConverter} is to generate document ids used as unique keys <uniqueKey></uniqueKey> that are unique in the entire 
 * collection ,i.e there are no repeated ids in the shards that contain the same collection(core). This is a restriction of Solr'<a href="http://wiki.apache.org/solr/DistributedSearch">DistributedSearch</a>
 * 
 * 
 * @author epalace
 *
 * @param <KEY_TYPE>
 * @param <VALUE_TYPE>
 */
public abstract class ShardedSolrDocumentConverter<KEY_TYPE, VALUE_TYPE>
        extends SolrDocumentConverter<KEY_TYPE, VALUE_TYPE> {

    public static final String PARTITIONER_CONF = "sharded_solr.partitioner";

    private static final Logger log = Logger.getLogger(ShardedSolrDocumentConverter.class);

    private Integer currentShard;
    private int numShards;
    private int shardNumberShift;
    private Long lastRecordId = null;
    private Partitioner<KEY_TYPE, VALUE_TYPE> partitioner;

    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public void setConf(Configuration conf) {
        super.setConf(conf);
        if (conf != null) {
            JobConf jobConf = (JobConf) conf;
            numShards = jobConf.getNumReduceTasks();
            shardNumberShift = (int) Math.ceil(numDigits(Long.MAX_VALUE)) - 1
                    - (int) Math.ceil(numDigits(numShards));
            log.info("Num shards : " + numShards);

            Class partitionerClass;
            try {
                partitionerClass = Class.forName(jobConf.get(PARTITIONER_CONF));
            } catch (ClassNotFoundException e) {
                e.printStackTrace();
                throw new RuntimeException(
                        "Partitioner not set.Use conf.set(ShardedSolrDocument.PARTITIONER_CONF,partitionerClassName)",
                        e);
            }
            log.info("Partitioner class : " + partitionerClass);
            partitioner = (Partitioner<KEY_TYPE, VALUE_TYPE>) ReflectionUtils.newInstance(partitionerClass, conf);
        }
    }

    @Override
    public Collection<SolrInputDocument> convert(KEY_TYPE key, VALUE_TYPE value) {
        if (currentShard == null) {
            // a partitioner is used to calculate which reducer task is running.
            // this is done just once, because all the records received in this reducer have the same reducer number.
            currentShard = partitioner.getPartition(key, value, numShards);
            lastRecordId = (((long) currentShard * (long) Math.pow(10l, shardNumberShift)));
            log.info("Current shard:" + currentShard + " First id : " + lastRecordId);
        }
        return null;
    }

    public long generateUniqueKeyId() {
        return ++lastRecordId; //TODO check if record Id is too high ??
    }

    public static double numBits(long num) {
        return Math.log(num) / Math.log(2);
    }

    public static double numDigits(long num) {
        return Math.log10(num);
    }

}