org.apache.avro.mapred.TestAvroMultipleInputs.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.avro.mapred.TestAvroMultipleInputs.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.avro.mapred;

import java.io.IOException;
import java.io.File;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.BufferedInputStream;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.Reporter;
import org.apache.avro.Schema;
import org.apache.avro.io.DatumReader;
import org.apache.avro.io.DatumWriter;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.file.DataFileStream;
import org.apache.avro.reflect.ReflectData;
import org.apache.avro.reflect.ReflectDatumWriter;
import org.apache.avro.reflect.ReflectDatumReader;
import org.junit.Test;

import static org.junit.Assert.*;

public class TestAvroMultipleInputs {

    /**
     * Record type of the first job input: an integer id together with a name.
     * Both fields start out at placeholder defaults ({@code -1} and the empty string).
     */
    public static class NamesRecord {
        private int id = -1;
        private CharSequence name = "";

        /** No-argument constructor; leaves both fields at their defaults. */
        public NamesRecord() {
        }

        /**
         * Creates a record holding the given values.
         *
         * @param id   the record id
         * @param name the name associated with the id
         */
        public NamesRecord(int id, CharSequence name) {
            this.name = name;
            this.id = id;
        }

        /** Renders the record as the id and the name separated by a tab. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(id);
            text.append('\t');
            text.append(String.valueOf(name));
            return text.toString();
        }
    }

    /**
     * Record type of the second job input: an integer id together with a balance.
     * The fields start out at {@code -1} and {@code 0} respectively.
     */
    public static class BalancesRecord {
        private int id = -1;
        private long balance = 0L;

        /** No-argument constructor; leaves both fields at their defaults. */
        public BalancesRecord() {
        }

        /**
         * Creates a record holding the given values.
         *
         * @param id      the record id
         * @param balance the balance associated with the id
         */
        public BalancesRecord(int id, long balance) {
            this.balance = balance;
            this.id = id;
        }

        /** Renders the record as the id and the balance separated by a tab. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(id);
            text.append('\t');
            text.append(balance);
            return text.toString();
        }
    }

    /**
     * Key type emitted by both mappers; it wraps the integer id on which the
     * two inputs are joined. The id defaults to {@code -1}.
     */
    public static class KeyRecord {
        private int id = -1;

        /** No-argument constructor; leaves the id at its default. */
        public KeyRecord() {
        }

        /**
         * Creates a key for the given id.
         *
         * @param id the join id
         */
        public KeyRecord(int id) {
            this.id = id;
        }

        /** Returns the decimal text of the id. */
        @Override
        public String toString() {
            return Integer.toString(id);
        }
    }

    /**
     * Value type shared by both mappers. Besides the union of the name and
     * balance fields it carries a tag ({@code recType}) identifying which
     * source record type the value was built from.
     */
    public static class JoinableRecord {
        private int id = -1;
        private CharSequence name = "";
        private long balance = 0L;
        private CharSequence recType = "";

        /** No-argument constructor; leaves every field at its default. */
        public JoinableRecord() {
        }

        /**
         * Creates a tagged value.
         *
         * @param recType tag naming the source record type
         * @param id      the join id
         * @param name    the name, when the source provides one
         * @param balance the balance, when the source provides one
         */
        public JoinableRecord(CharSequence recType, int id, CharSequence name, long balance) {
            this.recType = recType;
            this.id = id;
            this.name = name;
            this.balance = balance;
        }

        /** Returns the source tag only; the remaining fields are not included. */
        @Override
        public String toString() {
            final CharSequence tag = recType;
            return tag.toString();
        }
    }

    /**
     * Output type of the job: one joined row holding the id, the name and the
     * balance. Fields default to {@code -1}, the empty string and {@code 0}.
     */
    public static class CompleteRecord {
        private int id = -1;
        private CharSequence name = "";
        private long balance = 0L;

        /** No-argument constructor; leaves every field at its default. */
        public CompleteRecord() {
        }

        /**
         * Creates a fully populated record.
         *
         * @param id      the join id
         * @param name    the name for the id
         * @param balance the balance for the id
         */
        public CompleteRecord(int id, CharSequence name, long balance) {
            this.id = id;
            this.name = name;
            this.balance = balance;
        }

        /** Replaces the id. */
        void setId(int id) {
            this.id = id;
        }

        /** Replaces the name. */
        void setName(CharSequence name) {
            this.name = name;
        }

        /** Replaces the balance. */
        void setBalance(long balance) {
            this.balance = balance;
        }

        /** Renders the record as id, name and balance separated by tabs. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(id);
            text.append('\t');
            text.append(String.valueOf(name));
            text.append('\t');
            text.append(balance);
            return text.toString();
        }
    }

    /**
     * Mapper for the names input. Each {@link NamesRecord} is re-emitted as a
     * pair keyed by its id, with a {@link JoinableRecord} value tagged with the
     * record's class name.
     */
    public static class NamesMapImpl extends AvroMapper<NamesRecord, Pair<KeyRecord, JoinableRecord>> {

        /**
         * Emits one key/value pair for the given names record.
         *
         * @param nameRecord the input record
         * @param collector  receives the emitted pair
         * @param reporter   unused here
         * @throws IOException declared by the overridden signature; may be raised by the collector
         */
        @Override
        public void map(NamesRecord nameRecord, AvroCollector<Pair<KeyRecord, JoinableRecord>> collector,
                Reporter reporter) throws IOException {
            final KeyRecord joinKey = new KeyRecord(nameRecord.id);
            final String sourceTag = nameRecord.getClass().getName();
            // This input has no balance, so -1 is passed in that slot.
            final JoinableRecord joinValue = new JoinableRecord(sourceTag, nameRecord.id, nameRecord.name, -1L);
            collector.collect(new Pair<KeyRecord, JoinableRecord>(joinKey, joinValue));
        }

    }

    /**
     * Mapper for the balances input. Each {@link BalancesRecord} is re-emitted
     * as a pair keyed by its id, with a {@link JoinableRecord} value tagged with
     * the record's class name.
     */
    public static class BalancesMapImpl extends AvroMapper<BalancesRecord, Pair<KeyRecord, JoinableRecord>> {

        /**
         * Emits one key/value pair for the given balances record.
         *
         * @param balanceRecord the input record
         * @param collector     receives the emitted pair
         * @param reporter      unused here
         * @throws IOException declared by the overridden signature; may be raised by the collector
         */
        @Override
        public void map(BalancesRecord balanceRecord, AvroCollector<Pair<KeyRecord, JoinableRecord>> collector,
                Reporter reporter) throws IOException {
            final KeyRecord joinKey = new KeyRecord(balanceRecord.id);
            final String sourceTag = balanceRecord.getClass().getName();
            // This input has no name, so the empty string is passed in that slot.
            final JoinableRecord joinValue = new JoinableRecord(sourceTag, balanceRecord.id, "",
                    balanceRecord.balance);
            collector.collect(new Pair<KeyRecord, JoinableRecord>(joinKey, joinValue));
        }

    }

    /**
     * Reducer that folds all {@link JoinableRecord} values received for one key
     * into a single {@link CompleteRecord}.
     */
    public static class ReduceImpl extends AvroReducer<KeyRecord, JoinableRecord, CompleteRecord> {

        /**
         * Builds and emits one combined record for the key. Values whose tag
         * contains {@code "NamesRecord"} supply the name; every other value
         * supplies the balance. The id is copied from each value in turn.
         *
         * @param ID        the key shared by the values (not read here)
         * @param joinables the values grouped under the key
         * @param collector receives the combined record
         * @param reporter  unused here
         * @throws IOException declared by the overridden signature; may be raised by the collector
         */
        @Override
        public void reduce(KeyRecord ID, Iterable<JoinableRecord> joinables,
                AvroCollector<CompleteRecord> collector, Reporter reporter) throws IOException {
            final CompleteRecord joined = new CompleteRecord();
            for (JoinableRecord part : joinables) {
                joined.setId(part.id);
                final boolean fromNames = part.recType.toString().contains("NamesRecord");
                if (!fromNames) {
                    joined.setBalance(part.balance);
                    continue;
                }
                joined.setName(part.name);
            }
            collector.collect(joined);
        }

    }

    /**
     * Runs the two-input join job end to end: writes one names file and one
     * balances file, registers each input directory with its own mapper and
     * schema, reduces with {@link ReduceImpl}, and validates the single output
     * file.
     *
     * @throws Exception if preparing the inputs, running the job, or reading
     *                   the output fails
     */
    @Test
    public void testJob() throws Exception {
        JobConf job = new JobConf();
        // The leading "/" matters: without it the default base "." yields the
        // hidden directory ".target/..." instead of "./target/...".
        String dir = System.getProperty("test.dir", ".") + "/target/testAvroMultipleInputs";
        Path inputPath1 = new Path(dir + "/in1");
        Path inputPath2 = new Path(dir + "/in2");
        Path outputPath = new Path(dir + "/out");

        // Start from a clean slate so leftovers of an earlier run cannot interfere.
        outputPath.getFileSystem(job).delete(outputPath, true);
        inputPath1.getFileSystem(job).delete(inputPath1, true);
        inputPath2.getFileSystem(job).delete(inputPath2, true);

        writeNamesFiles(new File(inputPath1.toUri().getPath()));
        writeBalancesFiles(new File(inputPath2.toUri().getPath()));

        job.setJobName("multiple-inputs-join");
        // Each input directory gets its own mapper class and input schema.
        AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class,
                ReflectData.get().getSchema(NamesRecord.class));
        AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class,
                ReflectData.get().getSchema(BalancesRecord.class));

        Schema keySchema = ReflectData.get().getSchema(KeyRecord.class);
        Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class);
        AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema));
        AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class));

        AvroJob.setReducerClass(job, ReduceImpl.class);
        // A single reducer means all output lands in one part file.
        job.setNumReduceTasks(1);

        FileOutputFormat.setOutputPath(job, outputPath);

        AvroJob.setReflect(job);

        JobClient.runJob(job);

        validateCompleteFile(new File(new File(dir, "out"), "part-00000.avro"));
    }

    /**
     * Writes a "names.avro" file with five sequential {@code <id, name>} pairs
     * (ids 0 through 4, names {@code "record" + id}) into the given directory,
     * creating the directory first if necessary.
     *
     * @param dir the directory that receives the file
     * @throws IOException if the file cannot be created or written
     */
    private void writeNamesFiles(File dir) throws IOException {
        DatumWriter<NamesRecord> writer = new ReflectDatumWriter<NamesRecord>();
        DataFileWriter<NamesRecord> out = new DataFileWriter<NamesRecord>(writer);
        File namesFile = new File(dir, "names.avro");
        dir.mkdirs();
        out.create(ReflectData.get().getSchema(NamesRecord.class), namesFile);
        // Close the writer even when an append fails, so the file handle is released.
        try {
            for (int i = 0; i < 5; i++) {
                out.append(new NamesRecord(i, "record" + i));
            }
        } finally {
            out.close();
        }
    }

    /**
     * Writes a "balances.avro" file with five sequential {@code <id, balance>}
     * pairs (ids 0 through 4, balances {@code id + 100}) into the given
     * directory, creating the directory first if necessary.
     *
     * @param dir the directory that receives the file
     * @throws IOException if the file cannot be created or written
     */
    private void writeBalancesFiles(File dir) throws IOException {
        DatumWriter<BalancesRecord> writer = new ReflectDatumWriter<BalancesRecord>();
        DataFileWriter<BalancesRecord> out = new DataFileWriter<BalancesRecord>(writer);
        File balancesFile = new File(dir, "balances.avro");
        dir.mkdirs();
        out.create(ReflectData.get().getSchema(BalancesRecord.class), balancesFile);
        // Close the writer even when an append fails, so the file handle is released.
        try {
            for (int i = 0; i < 5; i++) {
                out.append(new BalancesRecord(i, i + 100L));
            }
        } finally {
            out.close();
        }
    }

    /**
     * Reads the job output file and checks that it holds exactly five joined
     * records whose ids run 0 through 4 in order, each with the name
     * {@code "record" + id} and the balance {@code id + 100}.
     *
     * @param file the Avro data file produced by the job
     * @throws Exception if the file cannot be opened or read
     */
    private void validateCompleteFile(File file) throws Exception {
        DatumReader<CompleteRecord> reader = new ReflectDatumReader<CompleteRecord>();
        InputStream in = new BufferedInputStream(new FileInputStream(file));
        // The outer finally releases the file even when constructing the
        // DataFileStream fails; the inner one covers failing assertions.
        try {
            DataFileStream<CompleteRecord> records = new DataFileStream<CompleteRecord>(in, reader);
            try {
                int numRecs = 0;
                for (CompleteRecord rec : records) {
                    // JUnit convention: expected value first, actual value second.
                    assertEquals(numRecs, rec.id);
                    assertEquals(rec.id + 100L, rec.balance);
                    assertEquals("record" + rec.id, rec.name);
                    numRecs++;
                }
                assertEquals(5, numRecs);
            } finally {
                records.close();
            }
        } finally {
            in.close();
        }
    }

}