de.tuberlin.dima.aim3.assignment1.BookAndAuthorBroadcastJoin.java Source code

Introduction

Here is the source code for de.tuberlin.dima.aim3.assignment1.BookAndAuthorBroadcastJoin.java
Source

/**
 * AIM3 - Scalable Data Mining -  course work
 * Copyright (C) 2014  Sebastian Schelter
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

package de.tuberlin.dima.aim3.assignment1;

import de.tuberlin.dima.aim3.HadoopJob;
import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

/**
 * Broadcast (replicated) join of a books file with an authors file.
 *
 * <p>The authors file is registered with the {@link DistributedCache} and loaded into an in-memory
 * map by every mapper in {@code setup}; the books file is the regular job input and is joined
 * line by line in {@code map}.</p>
 *
 * <p>Line formats as consumed by this class (tab-separated):</p>
 * <ul>
 *   <li>authors: field 0 = author id, field 1 = author name</li>
 *   <li>books: field 0 = author id; fields 1 and 2 are emitted in swapped order
 *       (presumably year and title - verify against the data set)</li>
 * </ul>
 *
 * <p>Output: key = author name, value = {@code books[2] + TAB + books[1]}.</p>
 */
public class BookAndAuthorBroadcastJoin extends HadoopJob {

    /**
     * Configures and runs the join job.
     *
     * @param args command line arguments; {@code --authors}, {@code --books} and {@code --output}
     *             are read from the parsed argument map
     * @return 0 if the job completed successfully, -1 otherwise
     * @throws Exception if argument parsing, job setup or job execution fails
     */
    @Override
    public int run(String[] args) throws Exception {

        Map<String, String> parsedArgs = parseArgs(args);

        Path authors = new Path(parsedArgs.get("--authors"));
        Path books = new Path(parsedArgs.get("--books"));
        Path outputPath = new Path(parsedArgs.get("--output"));

        Job broadCastJoin = prepareJob(books, outputPath, TextInputFormat.class, BroadCastMapper.class, Text.class,
                Text.class, TextOutputFormat.class);

        // NOTE(review): the authors file is a plain text file, yet it is registered through the
        // archive/classpath API and read back in the mapper via getArchiveClassPaths(), which
        // looks like it only works when the path is directly readable on the task's local file
        // system. Consider addCacheFile()/getLocalCacheFiles() - confirm against the Hadoop
        // version in use before changing.
        DistributedCache.addArchiveToClassPath(authors, broadCastJoin.getConfiguration(),
                FileSystem.getLocal(broadCastJoin.getConfiguration()));

        // Propagate job failure to the caller instead of always reporting success.
        boolean succeeded = broadCastJoin.waitForCompletion(true);
        return succeeded ? 0 : -1;
    }

    /**
     * Mapper that joins each books line against the in-memory authors table.
     */
    static class BroadCastMapper extends Mapper<Object, Text, Text, Text> {

        private static final String FIELD_SEPARATOR = "\t";
        private static final String COUNTER_GROUP = "BookAndAuthorBroadcastJoin";

        /** Author id -> author name, filled once per task in {@link #setup}. */
        private final Map<String, String> loadedAuthorsMap = new HashMap<String, String>();

        // Reused output holders; avoids allocating two Text objects per emitted record.
        private final Text authorName = new Text();
        private final Text bookDetails = new Text();

        /**
         * Loads the authors file into {@link #loadedAuthorsMap}.
         *
         * <p>If no authors path is registered the map stays empty and the join emits nothing.
         * Lines with fewer than two tab-separated fields are skipped and counted.</p>
         *
         * @param context task context providing the job configuration
         * @throws IOException if the authors file cannot be opened or read
         */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {

            Path[] loadedAuthors = DistributedCache.getArchiveClassPaths(context.getConfiguration());

            // Both conditions must hold before element 0 may be accessed.
            if (loadedAuthors == null || loadedAuthors.length == 0) {
                return;
            }

            BufferedReader bufferedReader = null;
            try {
                // Decode explicitly as UTF-8 (the encoding Hadoop's Text uses) rather than with
                // the platform default charset.
                bufferedReader = new BufferedReader(new InputStreamReader(
                        new FileInputStream(loadedAuthors[0].toString()), "UTF-8"));
                String line;
                while ((line = bufferedReader.readLine()) != null) {
                    String[] splitAuthorsLine = line.split(FIELD_SEPARATOR);
                    if (splitAuthorsLine.length < 2) {
                        context.getCounter(COUNTER_GROUP, "MALFORMED_AUTHOR_LINES").increment(1);
                        continue;
                    }
                    loadedAuthorsMap.put(splitAuthorsLine[0], splitAuthorsLine[1]);
                }
            } finally {
                // The reader is still null if opening the file failed; closing it blindly would
                // mask the original exception with a NullPointerException.
                if (bufferedReader != null) {
                    bufferedReader.close();
                }
            }
        }

        /**
         * Joins one books line with the authors table on the author id (field 0).
         *
         * <p>Emits (author name, field 2 TAB field 1) when the author id is known; lines with an
         * unknown author id emit nothing. Lines with fewer than three fields are skipped and
         * counted.</p>
         *
         * @param key     input key supplied by the input format (unused)
         * @param value   one tab-separated line of the books file
         * @param context task context used to emit the joined record
         */
        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {

            String[] splitBooksLine = value.toString().split(FIELD_SEPARATOR);
            if (splitBooksLine.length < 3) {
                context.getCounter(COUNTER_GROUP, "MALFORMED_BOOK_LINES").increment(1);
                return;
            }

            // Direct hash lookup instead of scanning every author id for each book.
            String name = loadedAuthorsMap.get(splitBooksLine[0]);
            if (name == null) {
                return;
            }

            authorName.set(name);
            bookDetails.set(splitBooksLine[2] + FIELD_SEPARATOR + splitBooksLine[1]);
            context.write(authorName, bookDetails);
        }

    }

}