com.datasalt.pangool.solr.TupleSolrOutputFormatExample.java Source code


Introduction

Here is the source code for com.datasalt.pangool.solr.TupleSolrOutputFormatExample.java

Source

/**
 * Copyright [2012] [Datasalt Systems S.L.]
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.datasalt.pangool.solr;

import java.io.File;
import java.io.IOException;
import java.io.Serializable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import com.datasalt.pangool.PangoolRuntimeException;
import com.datasalt.pangool.io.Fields;
import com.datasalt.pangool.io.ITuple;
import com.datasalt.pangool.io.Schema;
import com.datasalt.pangool.io.Tuple;
import com.datasalt.pangool.tuplemr.IdentityTupleReducer;
import com.datasalt.pangool.tuplemr.TupleMRBuilder;
import com.datasalt.pangool.tuplemr.TupleMapper;
import com.datasalt.pangool.tuplemr.mapred.lib.input.HadoopInputFormat;

/**
 * Example usage of {@link TupleSolrOutputFormat} that is also used for unit testing.
 * <p>
 * This example creates three indexes: one in the main output, containing (user_id, message) pairs of English
 * messages, and two auxiliary indexes with French and Spanish messages.
 */
@SuppressWarnings("serial")
public class TupleSolrOutputFormatExample implements Serializable {
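    // Pangool serializes mapper/reducer instances and ships them with the job; the anonymous
    // TupleMapper below captures the enclosing class, so it must implement Serializable.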

    public int run(String input, String output, Configuration conf) throws Exception {
        // Define the intermediate schema: it must match Solr's schema.xml!
        final Schema schema = new Schema("iSchema", Fields.parse("user_id:string, message:string"));

        TupleMRBuilder job = new TupleMRBuilder(conf);
        job.addIntermediateSchema(schema);
        job.setGroupByFields("user_id");
        // Define the input and its associated mapper.
        // We'll just have a Mapper; the Reducer will be the identity reducer
        job.addInput(new Path(input), new HadoopInputFormat(TextInputFormat.class),
                new TupleMapper<LongWritable, Text>() {

                    Tuple tuple = new Tuple(schema);

                    @Override
                    public void map(LongWritable key, Text value, TupleMRContext context, Collector collector)
                            throws IOException, InterruptedException {
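                        // Input lines are tab-separated: user_id <TAB> language <TAB> message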
                        String[] fields = value.toString().split("\t");
                        String language = fields[1];
                        tuple.set("user_id", fields[0]);
                        tuple.set("message", fields[2]);
                        if (language.equals("en")) {
                            // English -> write to main output
                            collector.write(tuple);
                        } else if (language.equals("fr")) {
                            // French -> write to french index
                            collector.getNamedOutput("FR").write(tuple, NullWritable.get());
                        } else if (language.equals("es")) {
                            // Spanish -> write to spanish index
                            collector.getNamedOutput("ES").write(tuple, NullWritable.get());
                        }
                    }
                });
        // Add multi-output: French index
        job.addNamedOutput("FR", new TupleSolrOutputFormat(new File("src/test/resources/solr-fr"), conf),
                ITuple.class, NullWritable.class);
        // Add multi-output: Spanish index
        job.addNamedOutput("ES", new TupleSolrOutputFormat(new File("src/test/resources/solr-es"), conf),
                ITuple.class, NullWritable.class);
        job.setTupleReducer(new IdentityTupleReducer());
        // Main output: English index
        job.setOutput(new Path(output), new TupleSolrOutputFormat(new File("src/test/resources/solr-en"), conf),
                ITuple.class, NullWritable.class);
        Job hadoopJob = job.createJob();
        try {
            hadoopJob.waitForCompletion(true);
            if (!hadoopJob.isSuccessful()) {
                throw new PangoolRuntimeException("Job was not successful");
            }
        } finally {
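            // Remove the serialized instance files that Pangool shipped with the job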
            job.cleanUpInstanceFiles();
        }
        return 0;
    }
}
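
A minimal, hypothetical driver for the example above might look as follows. The driver class name, input/output paths, and sample lines are illustrative assumptions, not part of Pangool; the job expects tab-separated input lines of the form user_id, language, message.

import org.apache.hadoop.conf.Configuration;

/** Hypothetical driver for TupleSolrOutputFormatExample. Paths below are illustrative. */
public class TupleSolrOutputFormatExampleDriver {

    public static void main(String[] args) throws Exception {
        // Sample input lines (tab-separated):
        //   1    en    hello world      -> main (English) index
        //   2    fr    bonjour le monde -> "FR" named output
        //   3    es    hola mundo       -> "ES" named output
        new TupleSolrOutputFormatExample().run("input/messages.txt", "output/solr-index",
                new Configuration());
    }
}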