com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java Source code

Java tutorial

Introduction

Here is the source code for com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java.

Source

/*
 * Copyright 2013 Alex Holmes
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.alexholmes.hadooputils.combine.avro.mapreduce;

import com.alexholmes.hadooputils.util.AvroFiles;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.hadoop.io.AvroKeyValue;
import org.apache.avro.io.DatumReader;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapred.AvroValue;
import org.apache.avro.mapreduce.AvroJob;
import org.apache.avro.mapreduce.AvroKeyValueInputFormat;
import org.apache.avro.mapreduce.AvroKeyValueOutputFormat;
import org.apache.avro.specific.SpecificDatumReader;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import static org.junit.Assert.*;

/**
 * Tests {@link CombineAvroKeyValueInputFormat} end-to-end by running a small
 * inverted-index MapReduce job over an Avro key/value input file and verifying
 * the Avro container file the job produces.
 */
public class CombineAvroKeyValueInputFormatTest {
    @Rule
    public TemporaryFolder mTempDir = new TemporaryFolder();

    /**
     * Creates an Avro file of <docid, text> pairs to use for test input:
     *
     * +-----+-----------------------+
     * | KEY | VALUE                 |
     * +-----+-----------------------+
     * | 1   | "apple banana carrot" |
     * | 2   | "apple banana"        |
     * | 3   | "apple"               |
     * +-----+-----------------------+
     *
     * @return The avro file.
     * @throws IOException if the file cannot be written.
     */
    private File createInputFile() throws IOException {
        Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.INT),
                Schema.create(Schema.Type.STRING));

        AvroKeyValue<Integer, CharSequence> record1 = new AvroKeyValue<Integer, CharSequence>(
                new GenericData.Record(keyValueSchema));
        record1.setKey(1);
        record1.setValue("apple banana carrot");

        AvroKeyValue<Integer, CharSequence> record2 = new AvroKeyValue<Integer, CharSequence>(
                new GenericData.Record(keyValueSchema));
        record2.setKey(2);
        record2.setValue("apple banana");

        AvroKeyValue<Integer, CharSequence> record3 = new AvroKeyValue<Integer, CharSequence>(
                new GenericData.Record(keyValueSchema));
        record3.setKey(3);
        record3.setValue("apple");

        return AvroFiles.createFile(new File(mTempDir.getRoot(), "inputKeyValues.avro"), keyValueSchema,
                record1.get(), record2.get(), record3.get());
    }

    /** A mapper for indexing documents: emits one (token, docid) pair per whitespace-split token. */
    public static class IndexMapper extends Mapper<AvroKey<Integer>, AvroValue<CharSequence>, Text, IntWritable> {
        @Override
        protected void map(AvroKey<Integer> docid, AvroValue<CharSequence> body, Context context)
                throws IOException, InterruptedException {
            for (String token : body.datum().toString().split(" ")) {
                context.write(new Text(token), new IntWritable(docid.datum()));
            }
        }
    }

    /** A reducer for aggregating token to docid mapping into a hitlist. */
    public static class IndexReducer extends Reducer<Text, IntWritable, Text, AvroValue<List<Integer>>> {
        @Override
        protected void reduce(Text token, Iterable<IntWritable> docids, Context context)
                throws IOException, InterruptedException {
            List<Integer> hitlist = new ArrayList<Integer>();
            for (IntWritable docid : docids) {
                hitlist.add(docid.get());
            }
            context.write(token, new AvroValue<List<Integer>>(hitlist));
        }
    }

    /**
     * Reads the next record from the output file and asserts it maps
     * {@code expectedToken} to exactly the given docids (in any order).
     *
     * @param reader         the open reader positioned at the next record.
     * @param expectedToken  the token the record's key must equal.
     * @param expectedDocids the docids the record's hitlist must contain, one each.
     */
    private static void assertHitlist(DataFileReader<GenericRecord> reader, String expectedToken,
                                      Integer... expectedDocids) {
        assertTrue("expected a record for token: " + expectedToken, reader.hasNext());
        AvroKeyValue<CharSequence, List<Integer>> record = new AvroKeyValue<CharSequence, List<Integer>>(
                reader.next());
        assertNotNull(record.get());
        assertEquals(expectedToken, record.getKey().toString());
        List<Integer> docids = record.getValue();
        assertEquals(expectedDocids.length, docids.size());
        for (Integer docid : expectedDocids) {
            assertTrue("missing docid " + docid + " for token " + expectedToken, docids.contains(docid));
        }
    }

    @Test
    public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
        // Create a test input file.
        File inputFile = createInputFile();

        // Configure the job input.
        // NOTE(review): Job.getInstance() is preferred on Hadoop 2.x; new Job() is kept
        // here for compatibility with older Hadoop releases — confirm target version.
        Job job = new Job();
        FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
        job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
        AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
        AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

        // Configure a mapper.
        job.setMapperClass(IndexMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Configure a reducer.
        job.setReducerClass(IndexReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(AvroValue.class);
        AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

        // Configure the output format.
        job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
        Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
        FileOutputFormat.setOutputPath(job, outputPath);

        // Run the job.
        assertTrue(job.waitForCompletion(true));

        // Verify that the output Avro container file has the expected data.
        // Reducer output arrives sorted by token, so records appear in
        // alphabetical order: apple, banana, carrot.
        File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
        DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(AvroKeyValue
                .getSchema(Schema.create(Schema.Type.STRING), Schema.createArray(Schema.create(Schema.Type.INT))));
        DataFileReader<GenericRecord> avroFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
        // Close the reader even when an assertion fails, so the temp file is released.
        try {
            assertHitlist(avroFileReader, "apple", 1, 2, 3);
            assertHitlist(avroFileReader, "banana", 1, 2);
            assertHitlist(avroFileReader, "carrot", 1);
            assertFalse(avroFileReader.hasNext());
        } finally {
            avroFileReader.close();
        }
    }
}