com.scaleunlimited.helpful.tools.AnalyzeEmail.java Source code


Introduction

Here is the source code for com.scaleunlimited.helpful.tools.AnalyzeEmail.java.
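
AnalyzeEmail is a command-line tool built on Bixo and Cascading. It reads a text file of mod_mbox archive page URLs, fetches those pages, extracts the URLs of the mbox files they link to, fetches and splits those mbox files into individual email messages, scores each message, sums the scores per email address, and writes the fetch status, fetched content, and analysis results to separate directories under the output directory.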

Source

/*
 * Copyright 2009-2012 Scale Unlimited
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */
package com.scaleunlimited.helpful.tools;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashMap;
import java.util.Properties;

import org.apache.hadoop.mapred.ClusterStatus;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.FetchedDatum;
import bixo.datum.StatusDatum;
import bixo.datum.UrlDatum;
import bixo.fetcher.BaseFetcher;
import bixo.fetcher.SimpleHttpFetcher;
import bixo.operations.BaseScoreGenerator;
import bixo.operations.FixedScoreGenerator;
import bixo.pipes.FetchPipe;
import cascading.flow.Flow;
import cascading.flow.FlowConnector;
import cascading.flow.FlowProcess;
import cascading.flow.MultiMapReducePlanner;
import cascading.flow.PlannerException;
import cascading.operation.BaseOperation;
import cascading.operation.Function;
import cascading.operation.FunctionCall;
import cascading.operation.expression.ExpressionFilter;
import cascading.pipe.Each;
import cascading.pipe.Every;
import cascading.pipe.GroupBy;
import cascading.pipe.Pipe;
import cascading.pipe.SubAssembly;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tap.Tap;
import cascading.tuple.Fields;

import com.bixolabs.cascading.NullContext;
import com.scaleunlimited.helpful.operations.CalcMessageScoreBuffer;
import com.scaleunlimited.helpful.operations.FieldNames;
import com.scaleunlimited.helpful.operations.MboxSplitterFunction;
import com.scaleunlimited.helpful.operations.ParseEmailFunction;
import com.scaleunlimited.helpful.operations.ParseModMboxPageFunction;
import com.scaleunlimited.helpful.operations.SumScoresBuffer;

public class AnalyzeEmail {
    private static final Logger LOGGER = Logger.getLogger(AnalyzeEmail.class);

    private static final String WEB_ADDRESS = "http://wiki.github.com/bixo/bixo/bixocrawler";

    private static final String EMAIL_ADDRESS = "bixo-dev@yahoogroups.com";

    private static final int MAX_CONTENT_SIZE = 8 * 1024 * 1024;

    private static final int MAX_THREADS = 1;
    private static final int NUM_REDUCERS = 1;

    private static final String MBOX_PAGE_STATUS_PIPE_NAME = "mbox page fetch status pipe";
    private static final String SPLITTER_PIPE_NAME = "Split emails pipe";
    private static final String ANALYZER_PIPE_NAME = "Analyze emails pipe";

    @SuppressWarnings("serial")
    private static class LoadUrlFunction extends BaseOperation<NullContext> implements Function<NullContext> {
        public LoadUrlFunction() {
            super(UrlDatum.FIELDS);
        }

        @Override
        public void operate(FlowProcess process, FunctionCall<NullContext> funcCall) {
            String url = funcCall.getArguments().getString("line");
            if ((url.length() == 0) || (url.startsWith("#"))) {
                return;
            }

            try {
                // Validate the URL
                new URL(url);
                UrlDatum urlDatum = new UrlDatum(url);
                funcCall.getOutputCollector().add(urlDatum.getTuple());
            } catch (MalformedURLException e) {
                LOGGER.error("Invalid URL in input data file: " + url);
            }
        }
    }

    @SuppressWarnings("serial")
    private static class SplitEmails extends SubAssembly {

        public SplitEmails(FetchPipe fetchPipe) {
            Pipe splitPipe = new Pipe(SPLITTER_PIPE_NAME, fetchPipe.getContentTailPipe());
            splitPipe = new Each(splitPipe, new MboxSplitterFunction());
            // TODO KKr - code currently relies on splitPipe being first tail pipe.
            setTails(splitPipe, fetchPipe.getStatusTailPipe());
        }
    }

    private static void printUsageAndExit(CmdLineParser parser) {
        parser.printUsage(System.err);
        System.exit(-1);
    }

    private static JobConf getDefaultJobConf() throws IOException {
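        // Ask the cluster how many tasktrackers it has, so the job can be sized to match.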
        JobClient jobClient = new JobClient(new JobConf());
        ClusterStatus status = jobClient.getClusterStatus();
        int trackers = status.getTaskTrackers();

        JobConf conf = new JobConf();
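        // Roughly 10 map tasks per tasktracker, and one reduce task per configured reduce slot.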
        conf.setNumMapTasks(trackers * 10);

        conf.setNumReduceTasks((trackers * conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 2)));

        conf.setMapSpeculativeExecution(false);
        conf.setReduceSpeculativeExecution(false);
        conf.set("mapred.child.java.opts", "-server -Xmx512m -Xss128k");

        // Should match the value used for Xss above. Note no 'k' suffix for the ulimit command.
        // New support that one day will be in Hadoop.
        conf.set("mapred.child.ulimit.stack", "128");

        return conf;
    }

    private static Properties getDefaultProperties(AnalyzeEmailOptions options, JobConf conf) throws IOException {
        Properties properties = new Properties();

        // Use special Cascading hack to control logging levels
        if (options.isDebugLogging()) {
            properties.put("log4j.logger", "cascading=DEBUG,sharethis=DEBUG,bixo=TRACE");
        } else {
            properties.put("log4j.logger", "cascading=INFO,sharethis=INFO,bixo=INFO");
        }

        FlowConnector.setApplicationJarClass(properties, AnalyzeEmail.class);

        // Propagate properties into the Hadoop JobConf
        MultiMapReducePlanner.setJobConf(properties, conf);

        return properties;
    }

    /**
     * @param args
     */
    public static void main(String[] args) {
        AnalyzeEmailOptions options = new AnalyzeEmailOptions();
        CmdLineParser parser = new CmdLineParser(options);

        try {
            parser.parseArgument(args);
        } catch (CmdLineException e) {
            System.err.println(e.getMessage());
            printUsageAndExit(parser);
        }

        String inputFileName = options.getInputFile();
        String outputDirName = options.getOutputDir();

        try {
            // Create the input (source tap), which is just a text file reader
            Tap sourceTap = new Hfs(new TextLine(), inputFileName);

            // Create the sub-assembly that runs the fetch job
            UserAgent userAgent = new UserAgent(options.getAgentName(), EMAIL_ADDRESS, WEB_ADDRESS);
            Pipe importPipe = new Each("url importer", new Fields("line"), new LoadUrlFunction());

            BaseScoreGenerator scorer = new FixedScoreGenerator();

            BaseFetcher fetcher = new SimpleHttpFetcher(MAX_THREADS, userAgent);
            FetchPipe fetchPagePipe = new FetchPipe(importPipe, scorer, fetcher, NUM_REDUCERS);

            // Here's the pipe that will output UrlDatum tuples, by extracting URLs from the mod_mbox-generated page.
            Pipe mboxPagePipe = new Each(fetchPagePipe.getContentTailPipe(), new ParseModMboxPageFunction(),
                    Fields.RESULTS);

            // Create a named pipe for the status of the mod_mbox-generated pages.
            Pipe mboxPageStatusPipe = new Pipe(MBOX_PAGE_STATUS_PIPE_NAME, fetchPagePipe.getStatusTailPipe());

            // Set up appropriate FetcherPolicy, where we increase the max content size (since mailbox files
            // can be big, e.g. 4MB).
            FetcherPolicy defaultPolicy = new FetcherPolicy();
            defaultPolicy.setMaxContentSize(MAX_CONTENT_SIZE);
            fetcher = new SimpleHttpFetcher(MAX_THREADS, defaultPolicy, userAgent);

            // We can create the fetch pipe, and set up our Mbox splitter to run on content.
            FetchPipe fetchMboxPipe = new FetchPipe(mboxPagePipe, scorer, fetcher, NUM_REDUCERS);
            SplitEmails splitterPipe = new SplitEmails(fetchMboxPipe);

            // Now create the pipe that's going to analyze the emails we get after splitting them up.
            Pipe analysisPipe = new Pipe(ANALYZER_PIPE_NAME, splitterPipe.getTails()[0]);
            analysisPipe = new Each(analysisPipe, new ParseEmailFunction());

            // We'll get output that has ANALYZED_EMAIL_FIELDS in it. We want to group by
            // the message-id field, and then do an aggregation on that of the scores.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.MESSAGE_ID));
            analysisPipe = new Every(analysisPipe, new CalcMessageScoreBuffer(), Fields.RESULTS);

            // Now we want to sum the scores for each user, which is another grouping/summing.
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.EMAIL_ADDRESS));
            analysisPipe = new Every(analysisPipe, new SumScoresBuffer(), Fields.RESULTS);

            // Let's filter out anybody with an uninteresting score.
            ExpressionFilter filter = new ExpressionFilter(String.format("%s <= 0.0", FieldNames.SUMMED_SCORE),
                    Double.class);
            analysisPipe = new Each(analysisPipe, filter);

            // And let's sort in reverse order (high to low score)
            analysisPipe = new GroupBy(analysisPipe, new Fields(FieldNames.SUMMED_SCORE), true);

            // Create the sink taps
            Tap pageStatusSinkTap = new Hfs(new TextLine(StatusDatum.FIELDS.size()), outputDirName + "/page-status",
                    true);
            Tap mboxStatusSinkTap = new Hfs(new TextLine(StatusDatum.FIELDS.size()), outputDirName + "/mbox-status",
                    true);
            Tap contentSinkTap = new Hfs(new SequenceFile(FetchedDatum.FIELDS), outputDirName + "/content", true);
            Tap analyzerSinkTap = new Hfs(new TextLine(), outputDirName + "/analysis", true);

            HashMap<String, Tap> sinkTapMap = new HashMap<String, Tap>(2);
            sinkTapMap.put(MBOX_PAGE_STATUS_PIPE_NAME, pageStatusSinkTap);
            sinkTapMap.put(FetchPipe.STATUS_PIPE_NAME, mboxStatusSinkTap);
            sinkTapMap.put(SPLITTER_PIPE_NAME, contentSinkTap);
            sinkTapMap.put(ANALYZER_PIPE_NAME, analyzerSinkTap);

            LOGGER.info("Running fetch job with " + options);

            // Finally we can run it.
            JobConf conf = getDefaultJobConf();
            FlowConnector flowConnector = new FlowConnector(getDefaultProperties(options, conf));
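            // Cascading matches each tail pipe to its sink tap by pipe name, using the map built above.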
            Flow flow = flowConnector.connect(sourceTap, sinkTapMap, splitterPipe, mboxPageStatusPipe,
                    analysisPipe);
            flow.writeDOT("build/goodFlow.dot");
            flow.complete();
        } catch (PlannerException e) {
            System.err.println("PlannerException running AnalyzeEmail: " + e.getMessage());
            e.writeDOT("build/failedFlow.dot");
            System.exit(-1);
        } catch (Throwable t) {
            System.err.println("Exception running AnalyzeEmail: " + t.getMessage());
            t.printStackTrace(System.err);
            System.exit(-1);
        }

    }

}
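
The listing above references an AnalyzeEmailOptions bean that is not included on this page. Below is a minimal sketch of what such an args4j options class might look like. Only the getters used by AnalyzeEmail (getAgentName, getInputFile, getOutputDir, isDebugLogging) come from the code above; the flag names (-agentname, -inputfile, -outputdir, -debug), the usage strings, and the defaults are assumptions, not the actual class.

// Hypothetical sketch of the AnalyzeEmailOptions bean used by AnalyzeEmail; the real
// class is not part of this listing. Only the getter names match the code above; the
// args4j flag names and defaults below are assumptions.
package com.scaleunlimited.helpful.tools;

import org.kohsuke.args4j.Option;

public class AnalyzeEmailOptions {
    private String _agentName;
    private String _inputFile;
    private String _outputDir;
    private boolean _debugLogging = false;

    @Option(name = "-agentname", usage = "user agent name to report when fetching", required = true)
    public void setAgentName(String agentName) {
        _agentName = agentName;
    }

    public String getAgentName() {
        return _agentName;
    }

    @Option(name = "-inputfile", usage = "text file with one mod_mbox page URL per line", required = true)
    public void setInputFile(String inputFile) {
        _inputFile = inputFile;
    }

    public String getInputFile() {
        return _inputFile;
    }

    @Option(name = "-outputdir", usage = "output directory for status, content and analysis results", required = true)
    public void setOutputDir(String outputDir) {
        _outputDir = outputDir;
    }

    public String getOutputDir() {
        return _outputDir;
    }

    @Option(name = "-debug", usage = "enable debug logging", required = false)
    public void setDebugLogging(boolean debugLogging) {
        _debugLogging = debugLogging;
    }

    public boolean isDebugLogging() {
        return _debugLogging;
    }

    @Override
    public String toString() {
        return "AnalyzeEmailOptions [agentName=" + _agentName + ", inputFile=" + _inputFile
                + ", outputDir=" + _outputDir + ", debugLogging=" + _debugLogging + "]";
    }
}

Assuming flags along these lines, the tool would be invoked with -agentname, -inputfile and -outputdir arguments; check the actual AnalyzeEmailOptions source for the real flag names.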