com.accumulobook.advanced.mapreduce.WordCount.java Source code

Introduction

Here is the source code for com.accumulobook.advanced.mapreduce.WordCount.java, a MapReduce job that reads Wikipedia article text from an Accumulo table and writes per-word counts back into another Accumulo table.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.accumulobook.advanced.mapreduce;

import com.accumulobook.ExampleMiniCluster;
import com.accumulobook.WikipediaConstants;
import com.accumulobook.basic.WikipediaClient;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.accumulo.core.client.BatchWriterConfig;
import org.apache.accumulo.core.client.ClientConfiguration;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.mapreduce.AccumuloInputFormat;
import org.apache.accumulo.core.client.mapreduce.AccumuloOutputFormat;
import org.apache.accumulo.core.client.security.tokens.PasswordToken;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Mutation;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;
import org.apache.accumulo.core.util.Pair;
import org.apache.accumulo.core.util.shell.Shell;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.util.Tool;

public class WordCount implements Tool {

    private Configuration conf = null;

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(getConf());
        job.setMapOutputKeyClass(Text.class);          // the mapper and combiner emit (word, count)
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);             // the reducer emits Mutations for AccumuloOutputFormat
        job.setOutputValueClass(Mutation.class);

        job.setMapperClass(WordCountMapper.class);
        job.setCombinerClass(WordCountCombiner.class);
        job.setReducerClass(WordCountReducer.class);

        // input
        job.setInputFormatClass(AccumuloInputFormat.class);

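        // connection information for the Accumulo instance: instance name and ZooKeeper hosts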
        ClientConfiguration zkiConfig = new ClientConfiguration()
                .withInstance(args[0])
                .withZkHosts(args[1]);

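        // read from the articles table, fetching only the contents column of each row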
        AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
        List<Pair<Text, Text>> columns = new ArrayList<>();
        columns.add(new Pair<>(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

        AccumuloInputFormat.fetchColumns(job, columns);
        AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
        AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

        // output
        job.setOutputFormatClass(AccumuloOutputFormat.class);

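        // default BatchWriter settings; memory, latency, and write threads can be tuned here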
        BatchWriterConfig config = new BatchWriterConfig();

        AccumuloOutputFormat.setBatchWriterOptions(job, config);
        AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
        AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
        AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
        AccumuloOutputFormat.setCreateTables(job, true);

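        // ship the jar containing this class (and its inner classes) to the MapReduce workers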
        job.setJarByClass(WordCount.class);

        // block until the job finishes so the results are ready when this method returns
        return job.waitForCompletion(true) ? 0 : 1;
    }

    @Override
    public void setConf(Configuration conf) {
        this.conf = conf;
    }

    @Override
    public Configuration getConf() {
        if (this.conf == null) {
            this.conf = new Configuration();
        }

        return this.conf;
    }

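    /**
     * Emits a (word, count) pair for each distinct word found in one article's contents value.
     */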
    public static class WordCountMapper extends Mapper<Key, Value, Text, IntWritable> {

        @Override
        public void map(Key k, Value v, Context context) throws IOException, InterruptedException {

            String text = new String(v.get(), StandardCharsets.UTF_8); // decode explicitly rather than with the platform default charset

            // count words in article
            HashMap<String, Integer> wordCounts = new HashMap<>();
            for (String word : text.replaceAll("[^a-zA-Z ]", " ").toLowerCase().split("\\s+")) {
                if (word.isEmpty()) {
                    continue; // splitting can yield an empty token when the text starts with a non-letter
                }
                Integer count = wordCounts.get(word);
                wordCounts.put(word, count == null ? 1 : count + 1);
            }

            for (Map.Entry<String, Integer> e : wordCounts.entrySet()) {
                context.write(new Text(e.getKey()), new IntWritable(e.getValue()));
            }
        }
    }

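    /**
     * Sums partial counts on the map side to cut down the amount of data shuffled to the reducers.
     */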
    public static class WordCountCombiner extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        public void reduce(Text k, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }

            context.write(k, new IntWritable(sum));
        }
    }

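    /**
     * Sums the counts for each word and writes the total back to Accumulo as a Mutation.
     */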
    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, Mutation> {

        @Override
        public void reduce(Text k, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable v : values) {
                sum += v.get();
            }

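            // row = word, column family = "count", empty qualifier, value = total count as a string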
            Mutation m = new Mutation(k.toString());
            m.put("count", "", Integer.toString(sum));

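            // a null key tells AccumuloOutputFormat to write to the default table configured in run()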
            context.write(null, m);
        }
    }

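    /**
     * Runs the example end to end: ingests sample Wikipedia articles into a mini Accumulo cluster,
     * runs the word count job on a mini MapReduce cluster, and then opens a shell for inspection.
     */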
    public static void main(String[] args) throws Exception {

        // startup mini accumulo and ingest data
        System.out.println("Ingesting example wikipedia articles ...");
        Connector conn = ExampleMiniCluster.getConnector();

        WikipediaClient client = new WikipediaClient(conn, new Authorizations());

        client.ingest(WikipediaConstants.HADOOP_PAGES);

        // start mr cluster
        System.out.println("done. starting mini mr cluster ...");
        MiniMRClusterRunner mmrce = new MiniMRClusterRunner();
        mmrce.setup();
        mmrce.start();

        System.out.println("started. running mapreduce files example ...");
        WordCount wc = new WordCount();

        wc.run(new String[] { ExampleMiniCluster.getInstanceName(), ExampleMiniCluster.getZooKeepers(), "root",
                "password" });

        System.out.println("done. Starting shell ...");

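        // connect an interactive Accumulo shell to the mini cluster so the word counts can be inspected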
        String[] shellArgs = new String[] { "-u", "root", "-p", "password", "-z",
                ExampleMiniCluster.getInstanceName(), ExampleMiniCluster.getZooKeepers() };

        Shell.main(shellArgs);
        mmrce.stop();
        ExampleMiniCluster.shutdown();
    }
}
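
Reading the results

The job writes one row per word into WikipediaConstants.WORD_COUNT_TABLE, with the total stored under the count column family. The snippet below is a minimal sketch of how those results could be read back with a plain Scanner once the job has finished; it reuses the ExampleMiniCluster connector and the constants from the example above, and the WordCountReader class name is only for illustration.

package com.accumulobook.advanced.mapreduce;

import com.accumulobook.ExampleMiniCluster;
import com.accumulobook.WikipediaConstants;
import java.util.Map;
import org.apache.accumulo.core.client.Connector;
import org.apache.accumulo.core.client.Scanner;
import org.apache.accumulo.core.data.Key;
import org.apache.accumulo.core.data.Value;
import org.apache.accumulo.core.security.Authorizations;

public class WordCountReader {

    public static void main(String[] args) throws Exception {
        // connect to the same mini cluster instance used by the WordCount example
        Connector conn = ExampleMiniCluster.getConnector();

        // scan the word count table with empty authorizations
        Scanner scanner = conn.createScanner(WikipediaConstants.WORD_COUNT_TABLE, new Authorizations());

        // each entry has the word as its row and the summed count as its value
        for (Map.Entry<Key, Value> entry : scanner) {
            System.out.println(entry.getKey().getRow() + ": " + entry.getValue());
        }
    }
}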