org.bigsolr.hadoop.SolrInputFormat.java Source code

Introduction

Here is the source code for org.bigsolr.hadoop.SolrInputFormat.java.
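
SolrInputFormat reads the results of a Solr query into a Hadoop job. It implements the InputFormat contract of both the current org.apache.hadoop.mapreduce API and the legacy org.apache.hadoop.mapred API: each getSplits variant runs a count-only query and partitions the matching documents into evenly sized SolrInputSplits, and each record-reader factory re-queries Solr for its split's slice of documents and serves them as NullWritable/SolrRecord pairs through a SolrRecordReader.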

Source

/*
 * Licensed to Taka Shinagawa under one or more contributor license agreements.
 * See the NOTICE file distributed with this work for additional information regarding
 * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the License. You may obtain a
 * copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
 */

package org.bigsolr.hadoop;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.RecordReader;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.client.solrj.response.QueryResponse;

import org.apache.log4j.Logger;

public class SolrInputFormat extends InputFormat<NullWritable, SolrRecord>
        implements org.apache.hadoop.mapred.InputFormat<NullWritable, SolrRecord> {

    private static final Logger log = Logger.getLogger(SolrInputFormat.class);

    // Job configuration keys; SERVER_MODE and SERVER_URL are not read in this class
    // and are presumably consumed by SolrOperations.getSolrServer(). ID_FIELD is the
    // Solr field requested by the count-only queries in getSplits.
    private static final String SOLR_QUERY = "solr.query";
    private static final String SERVER_MODE = "solr.server.mode";
    private static final String SERVER_URL = "solr.server.url";
    private static final String COLLECTION_NAME = "solr.server.collection";
    private static final String FIELDS = "solr.server.fields";
    private static final String ID_FIELD = "id";

    // New API
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        log.info("SolrInputFormat -> getSplits");

        Configuration conf = context.getConfiguration();
        String collectionName = conf.get(COLLECTION_NAME);
        // The reduce-task count doubles as the split count; it is zero for
        // map-only jobs, so clamp it to avoid a division by zero below.
        int numSplits = Math.max(1, context.getNumReduceTasks());
        SolrServer solr = SolrOperations.getSolrServer(conf);

        // This query is only used for the total hit count, so no rows are fetched.
        final SolrQuery solrQuery = new SolrQuery(conf.get(SOLR_QUERY));
        solrQuery.setFields(ID_FIELD);
        solrQuery.setRows(0);
        solrQuery.set("collection", collectionName);
        solrQuery.setStart(0);

        QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw new IOException(e);
        }

        int numResults = (int) response.getResults().getNumFound();
        int numDocsPerSplit = numResults / numSplits;
        int currentDoc = 0;

        // Evenly sized splits; the remainder of the integer division goes to
        // the last split (e.g. 103 documents over 4 splits -> 25, 25, 25, 28).
        List<InputSplit> splits = new ArrayList<InputSplit>();
        for (int i = 0; i < numSplits - 1; i++) {
            splits.add(new SolrInputSplit(currentDoc, numDocsPerSplit));
            currentDoc += numDocsPerSplit;
        }
        splits.add(new SolrInputSplit(currentDoc, numResults - currentDoc));

        return splits;
    }

    // Old API
    @Override
    public org.apache.hadoop.mapred.InputSplit[] getSplits(org.apache.hadoop.mapred.JobConf conf, int numSplits)
            throws IOException {
        log.info("SolrInputFormat -> getSplits");
        String collectionName = conf.get(COLLECTION_NAME);
        // Clamp the caller's split hint so the division below can never be by zero.
        numSplits = Math.max(1, numSplits);
        SolrServer solr = SolrOperations.getSolrServer(conf);

        // As in the new API, this query is only used for the total hit count.
        final SolrQuery solrQuery = new SolrQuery(conf.get(SOLR_QUERY));
        solrQuery.setFields(ID_FIELD);
        solrQuery.setRows(0);
        solrQuery.set("collection", collectionName);
        solrQuery.setStart(0);

        QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw new IOException(e);
        }

        int numResults = (int) response.getResults().getNumFound();
        int numDocsPerSplit = numResults / numSplits;
        int currentDoc = 0;

        // Same partitioning as the new API: the last split absorbs the remainder.
        List<SolrInputSplit> splits = new ArrayList<SolrInputSplit>();
        for (int i = 0; i < numSplits - 1; i++) {
            splits.add(new SolrInputSplit(currentDoc, numDocsPerSplit));
            currentDoc += numDocsPerSplit;
        }
        splits.add(new SolrInputSplit(currentDoc, numResults - currentDoc));

        return splits.toArray(new SolrInputSplit[splits.size()]);
    }

    // New API
    @Override
    public RecordReader<NullWritable, SolrRecord> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {

        log.info("SolrInputFormat -> createRecordReader");

        Configuration conf = context.getConfiguration();
        // TODO: the new API has no Reporter; progress (heartbeat) reporting is not implemented.

        String collectionName = conf.get(COLLECTION_NAME);
        String fields = conf.get(FIELDS);
        SolrServer solr = SolrOperations.getSolrServer(conf);

        SolrInputSplit solrSplit = (SolrInputSplit) split;
        final int numDocs = (int) solrSplit.getLength();

        // Fetch exactly the slice of results covered by this split.
        final SolrQuery solrQuery = new SolrQuery(conf.get(SOLR_QUERY));
        solrQuery.setFields(fields);
        solrQuery.set("collection", collectionName);
        solrQuery.setStart(solrSplit.getDocBegin());
        solrQuery.setRows(numDocs);

        QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw new IOException(e);
        }

        final SolrDocumentList solrDocs = response.getResults();
        return new SolrRecordReader(solrDocs, numDocs);
    }

    // Old API
    @Override
    public org.apache.hadoop.mapred.RecordReader<NullWritable, SolrRecord> getRecordReader(
            org.apache.hadoop.mapred.InputSplit split, org.apache.hadoop.mapred.JobConf conf,
            org.apache.hadoop.mapred.Reporter reporter) throws IOException {

        log.info("SolrInputFormat -> getRecordReader");

        String collectionName = conf.get(COLLECTION_NAME);
        String fields = conf.get(FIELDS);
        SolrServer solr = SolrOperations.getSolrServer(conf);

        SolrInputSplit solrSplit = (SolrInputSplit) split;
        // getLength() may throw IOException, which this method already declares,
        // so no catch-and-rewrap block is needed.
        final int numDocs = (int) solrSplit.getLength();

        // Fetch exactly the slice of results covered by this split.
        final SolrQuery solrQuery = new SolrQuery(conf.get(SOLR_QUERY));
        solrQuery.setFields(fields);
        solrQuery.set("collection", collectionName);
        solrQuery.setStart(solrSplit.getDocBegin());
        solrQuery.setRows(numDocs);

        QueryResponse response;
        try {
            response = solr.query(solrQuery);
        } catch (final SolrServerException e) {
            throw new IOException(e);
        }

        final SolrDocumentList solrDocs = response.getResults();
        return new SolrRecordReader(solrDocs, numDocs);
    }

}
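
Usage

The following driver is a minimal sketch, not part of the original sources, showing how this input format might be wired into a job. The configuration keys mirror the constants defined above; the query, the "standalone" server-mode value, the URL, collection, field list, and output path are illustrative assumptions, SolrOperations.getSolrServer(conf) is presumed to build the client from solr.server.mode and solr.server.url, and SolrRecord is assumed to be a Writable, as its use as a MapReduce value implies.

package org.bigsolr.hadoop;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// Hypothetical driver -- for illustration only.
public class SolrJobDriver {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("solr.query", "*:*");                              // SOLR_QUERY
        conf.set("solr.server.mode", "standalone");                 // SERVER_MODE (value assumed)
        conf.set("solr.server.url", "http://localhost:8983/solr");  // SERVER_URL
        conf.set("solr.server.collection", "collection1");          // COLLECTION_NAME
        conf.set("solr.server.fields", "id,title");                 // FIELDS

        Job job = Job.getInstance(conf, "solr-input-example");
        job.setJarByClass(SolrJobDriver.class);
        job.setInputFormatClass(SolrInputFormat.class);

        // Key/value classes match the InputFormat's type parameters; the default
        // identity mapper and reducer pass the records straight through.
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(SolrRecord.class);

        FileOutputFormat.setOutputPath(job, new Path(args[0]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

Note that getSplits above derives the split count from the job's reduce-task count, so the number of reducers also controls the input parallelism of such a job.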