wordcount.Spark_WordCountEachLine.java Source code

Java tutorial

Introduction

Here is the source code for wordcount.Spark_WordCountEachLine.java, a Spark job that counts the occurrences of each word in the files of an input directory.

Source

/*
 *
 * ****************
 * Copyright 2015 Tiziano Fagni (tiziano.fagni@isti.cnr.it)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ******************
 */

package wordcount;

import org.apache.spark.api.java.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

/**
 * Stand-alone Spark job that counts how often each word occurs in the files
 * of an input directory and writes the totals to
 * {@code <outputDir>/results.txt}.
 *
 * <p>Every input line is tokenized on its own: lines that are empty or that
 * start with {@code <doc} or {@code </doc} are skipped; the remaining lines
 * are split on whitespace and punctuation and each token is lower-cased
 * before being counted.
 */
public class Spark_WordCountEachLine {

    /** Word delimiters: a run of whitespace, or a single punctuation character. */
    private static final Pattern TOKEN_SEPARATORS = Pattern
            .compile("([\\s]+)|([\\:\\.\\,\\;\"\\<\\>\\[\\]\\{\\}\\\\/'\\\\&\\#\\*\\(\\)\\=\\?\\^\\!\\|])");

    /**
     * Runs the word count on a local Spark master.
     *
     * @param args exactly three values: {@code <inputDir> <outputDir> <numCores>}
     * @throws IllegalArgumentException if the number of arguments is not three
     *         (a non-numeric {@code numCores} surfaces as
     *         {@link NumberFormatException})
     */
    public static void main(String[] args) {

        if (args.length != 3) {
            throw new IllegalArgumentException(
                    "Usage: " + Spark_WordCountEachLine.class.getName() + " <inputDir> <outputDir> <numCores>");
        }

        long startTime = System.currentTimeMillis();

        String inputDir = args[0];
        String outputDir = args[1];
        int numCores = Integer.parseInt(args[2]);

        SparkConf conf = new SparkConf().setAppName("Spark word count");
        conf.setMaster("local[" + numCores + "]");
        JavaSparkContext sc = new JavaSparkContext(conf);

        List<scala.Tuple2<String, Integer>> res;
        try {
            res = countWords(sc.textFile(inputDir + "/*"));
        } finally {
            // Stop the context even when the job fails so it is not left running.
            sc.stop();
        }

        // Write results.
        writeTextFile(outputDir + "/results.txt", formatResults(res));

        long endTime = System.currentTimeMillis();
        System.out.println("Done! Execution time: " + (endTime - startTime) + " milliseconds.");
    }

    /**
     * Tokenizes every line, sums the per-word counts and collects the totals.
     *
     * @param textLines the input lines
     * @return one {@code (word, occurrences)} pair per distinct word
     */
    private static List<scala.Tuple2<String, Integer>> countWords(JavaRDD<String> textLines) {
        return textLines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterable<scala.Tuple2<String, Integer>> call(String line) throws Exception {
                        return toWordPairs(line);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer val1, Integer val2) throws Exception {
                        return val1 + val2;
                    }
                }).collect();
    }

    /**
     * Splits one line into {@code (word, 1)} pairs.
     *
     * @param line a single input line
     * @return the pairs for the line; empty when the line is empty or starts
     *         with {@code <doc} or {@code </doc}
     */
    private static List<scala.Tuple2<String, Integer>> toWordPairs(String line) {
        List<scala.Tuple2<String, Integer>> pairs = new ArrayList<scala.Tuple2<String, Integer>>();
        if (line.isEmpty() || line.startsWith("<doc") || line.startsWith("</doc")) {
            return pairs;
        }

        for (String token : TOKEN_SEPARATORS.split(line)) {
            // Adjacent delimiters produce empty tokens; they are not words.
            if (token.isEmpty()) {
                continue;
            }
            // Locale.ROOT keeps the counts independent of the JVM's default locale.
            String word = token.toLowerCase(java.util.Locale.ROOT);
            pairs.add(new scala.Tuple2<String, Integer>(word, 1));
        }
        return pairs;
    }

    /**
     * Renders the totals as text, one {@code Word: ... Occurrences: ...} line
     * per entry, in the order of the given list.
     *
     * @param counts the {@code (word, occurrences)} pairs to render
     * @return the report text
     */
    private static String formatResults(List<scala.Tuple2<String, Integer>> counts) {
        StringBuilder sb = new StringBuilder();
        for (scala.Tuple2<String, Integer> entry : counts) {
            String word = entry._1();
            int occurrences = entry._2();
            sb.append("Word: ").append(word)
                    .append(" Occurrences: ").append(occurrences)
                    .append("\n");
        }
        return sb.toString();
    }

    /**
     * Writes {@code textToWrite} to {@code filename}, replacing any existing
     * content and creating missing parent directories first.
     *
     * @param filename    path of the file to write
     * @param textToWrite content of the file
     * @throws RuntimeException wrapping any exception raised while writing
     */
    private static void writeTextFile(String filename, String textToWrite) {
        try {
            // A bare file name has no parent component; nothing to create then.
            File parentDir = new File(filename).getParentFile();
            if (parentDir != null) {
                parentDir.mkdirs();
            }

            BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter(filename, false));
            try {
                bufferedWriter.write(textToWrite);
            } finally {
                // Close on every path so the file handle is released if write fails.
                bufferedWriter.close();
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}