com.aerospike.hadoop.examples.externaljoin.ExternalJoin.java Source code

Java tutorial

Introduction

Here is the source code for com.aerospike.hadoop.examples.externaljoin.ExternalJoin.java

Source

/* 
 * Copyright 2014 Aerospike, Inc.
 *
 * Portions may be licensed to Aerospike, Inc. under one or more
 * contributor license agreements.
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You
 * may obtain a copy of the License at
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

package com.aerospike.hadoop.examples.externaljoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;

import java.nio.ByteBuffer;

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.StringTokenizer;

import org.apache.commons.codec.binary.Hex;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextOutputFormat;

import com.aerospike.client.AerospikeClient;
import com.aerospike.client.Bin;
import com.aerospike.client.Key;
import com.aerospike.client.Record;
import com.aerospike.client.policy.ClientPolicy;
import com.aerospike.client.policy.WritePolicy;
import com.aerospike.client.policy.Policy;

import com.aerospike.hadoop.mapreduce.AerospikeOutputFormat;
import com.aerospike.hadoop.mapreduce.AerospikeRecordWriter;
import com.aerospike.hadoop.mapreduce.AerospikeConfigUtil;

public class ExternalJoin extends Configured implements Tool {

    /** Logger shared by the job driver and its nested task classes. */
    private static final Log log = LogFactory.getLog(ExternalJoin.class);

    /**
     * Maximum gap between two consecutive hits of one user, in milliseconds
     * (20 minutes), before the later hit is treated as the start of a new
     * session.
     */
    private static final long SESSION_GAP_MSEC = 20 * 60 * 1000;

    // Sample line format:
    // 37518 - - [16/Jun/1998:02:48:36 +0000] \
    // "GET /images/hm_hola.gif HTTP/1.0" 200 2240
    //
    // The expression defines seven capture groups. The mapper uses group 1
    // (the leading numeric token, read as the user id) and group 4 (the
    // bracketed timestamp).
    private static final String logEntryRegex = "^([\\d.]+) (\\S+) (\\S+) \\[([\\w:/]+\\s[+\\-]\\d{4})\\] \"(.+?)\" (\\d{3}) (\\S+)";

    /** Compiled once; Pattern instances are immutable and safe to share. */
    private static final Pattern pat = Pattern.compile(logEntryRegex);

    // Parser for timestamps such as "16/Jun/1998:02:48:36 +0000". The locale is
    // pinned to English because the log's month abbreviations are English; with
    // the JVM default locale, parsing would fail on hosts configured for a
    // language whose month abbreviations differ.
    // NOTE: SimpleDateFormat is not thread-safe; this shared instance must only
    // be used from one thread at a time.
    private static final SimpleDateFormat dateTimeParser =
            new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", java.util.Locale.ENGLISH);

    /**
     * Mapper that turns one web-log line into a (userid, hit time) pair.
     *
     * <p>The line is matched against the log-entry pattern; capture group 1 is
     * parsed as the numeric user id and capture group 4 as the request
     * timestamp, which is emitted as milliseconds since the epoch.
     *
     * <p>Lines that cannot be interpreted are skipped (the job is best-effort
     * with respect to malformed input), but each skip is recorded in a job
     * counter instead of being discarded silently. Failures of the output
     * collector are NOT skipped: they propagate as {@link IOException}.
     */
    public static class Map extends MapReduceBase
            implements Mapper<LongWritable, Text, LongWritable, LongWritable> {

        /** Counter group under which skipped input lines are reported. */
        private static final String COUNTER_GROUP = "ExternalJoin";

        // Not read anywhere in this class; retained so the class keeps its
        // original set of members.
        int mapcount = 0;

        /**
         * Parses one log line and emits (userid, timestamp in msec).
         *
         * @param key      input key supplied by the input format (not used)
         * @param rec      one line of the web log
         * @param output   receives the (userid, timestamp) pair
         * @param reporter used to count skipped lines; may be null
         * @throws IOException if the output collector fails
         */
        public void map(LongWritable key, Text rec, OutputCollector<LongWritable, LongWritable> output,
                Reporter reporter) throws IOException {
            String line = rec.toString();
            Matcher matcher = pat.matcher(line);
            if (!matcher.matches() || 7 != matcher.groupCount()) {
                countSkipped(reporter, "MALFORMED_LINES");
                return;
            }

            // Group 1 may contain dots (the pattern allows them), in which
            // case it is not a numeric user id.
            long userid;
            try {
                userid = Long.parseLong(matcher.group(1));
            } catch (NumberFormatException ex) {
                countSkipped(reporter, "BAD_USERIDS");
                return;
            }

            // parse(String, ParsePosition) reports failure by returning null
            // rather than by throwing.
            Date date = dateTimeParser.parse(matcher.group(4), new ParsePosition(0));
            if (date == null) {
                countSkipped(reporter, "BAD_TIMESTAMPS");
                return;
            }

            output.collect(new LongWritable(userid), new LongWritable(date.getTime()));
        }

        /** Increments the named skip counter when a reporter is available. */
        private static void countSkipped(Reporter reporter, String counter) {
            if (reporter != null) {
                reporter.incrCounter(COUNTER_GROUP, counter, 1);
            }
        }
    }

    /**
     * Value type emitted by the reducer: one browsing session of one user,
     * joined with that user's demographic attributes.
     *
     * <p>Fields are public because the record writer reads them directly when
     * building the output bins. The serialized form is the six fields in
     * declaration order (three longs followed by three ints); {@link #write}
     * and {@link #readFields} must stay in step with each other.
     */
    private static class Session implements Writable {
        /** Id of the user the session belongs to. */
        public long userid;
        /** Timestamp of the first hit of the session. */
        public long start;
        /** Timestamp of the last hit of the session. */
        public long end;
        /** Number of hits in the session. */
        public int nhits;
        /** Value of the user's "age" attribute. */
        public int age;
        /** Value of the user's "isMale" attribute. */
        public int isMale;

        /**
         * Creates an empty session to be populated by {@link #readFields}.
         * Hadoop creates Writable instances reflectively through a
         * no-argument constructor before deserializing into them, so one must
         * exist for this type to be readable.
         */
        public Session() {
        }

        /** Creates a fully populated session. */
        public Session(long userid, long start, long end, int nhits, int age, int isMale) {
            this.userid = userid;
            this.start = start;
            this.end = end;
            this.nhits = nhits;
            this.age = age;
            this.isMale = isMale;
        }

        /**
         * Restores all fields from {@code in}, in the order {@link #write}
         * produced them.
         *
         * @throws IOException if reading from {@code in} fails
         */
        public void readFields(DataInput in) throws IOException {
            userid = in.readLong();
            start = in.readLong();
            end = in.readLong();
            nhits = in.readInt();
            age = in.readInt();
            isMale = in.readInt();
        }

        /**
         * Serializes all fields to {@code out} in declaration order.
         *
         * @throws IOException if writing to {@code out} fails
         */
        public void write(DataOutput out) throws IOException {
            out.writeLong(userid);
            out.writeLong(start);
            out.writeLong(end);
            out.writeInt(nhits);
            out.writeInt(age);
            out.writeInt(isMale);
        }
    }

    /**
     * Reducer that groups the hit timestamps of one user into sessions and
     * joins every session with the user's "age" and "isMale" bins, which are
     * looked up in Aerospike.
     *
     * <p>A new session starts whenever the gap to the previous hit exceeds
     * {@code SESSION_GAP_MSEC}. Each session is emitted under a session id
     * derived from a SHA-256 hash of the user id and the session start time.
     */
    public static class Reduce extends MapReduceBase implements Reducer<LongWritable, LongWritable, Text, Session> {

        private Policy policy;
        private AerospikeClient client;
        private String namespace;
        private String setName;

        /**
         * Opens the Aerospike connection used for the user lookups, using the
         * input host, port, namespace and set name found in the job
         * configuration.
         */
        @Override
        public void configure(JobConf job) {
            String host = AerospikeConfigUtil.getInputHost(job);
            int port = AerospikeConfigUtil.getInputPort(job);

            policy = new Policy();
            policy.timeout = 10000;
            client = new AerospikeClient(host, port);

            namespace = AerospikeConfigUtil.getInputNamespace(job);
            setName = AerospikeConfigUtil.getInputSetName(job);
        }

        /** Closes the Aerospike connection, if one was opened. */
        @Override
        public void close() {
            // configure() may have failed before the client was created.
            if (client != null) {
                client.close();
            }
        }

        /**
         * Splits one user's hits into sessions and emits each session.
         *
         * @param userid   the user all timestamps belong to
         * @param tstamps  hit times in msec, in arbitrary order
         * @param output   receives one (session id, Session) pair per session
         * @param reporter progress reporter (not used)
         * @throws IOException if the user lookup or the collector fails
         */
        public void reduce(LongWritable userid, Iterator<LongWritable> tstamps,
                OutputCollector<Text, Session> output, Reporter reporter) throws IOException {

            // Copy each value out of the iterator into its own object before
            // storing it, so the list holds independent timestamps.
            ArrayList<LongWritable> tsarray = new ArrayList<LongWritable>();
            while (tstamps.hasNext()) {
                tsarray.add(new LongWritable(tstamps.next().get()));
            }

            // Session detection needs the hits in chronological order.
            Collections.sort(tsarray);

            // An explicit flag marks "a session is open"; using a timestamp
            // value of 0 as the marker would mis-handle hits at or near the
            // epoch.
            boolean inSession = false;
            long prevHit = 0;
            long session_start = 0;
            long session_end = 0;
            int session_hits = 0;
            for (LongWritable tstamp : tsarray) {
                long tt = tstamp.get();

                // The first hit always opens a session; later hits open one
                // only when the gap to the prior hit is too large.
                if (!inSession || tt - prevHit > SESSION_GAP_MSEC) {

                    // Flush the session that just ended, if any.
                    if (inSession) {
                        collect_session(userid.get(), session_start, session_end, session_hits, output);
                    }

                    // Reset for the new session.
                    inSession = true;
                    session_start = tt;
                    session_hits = 0;
                }

                // Extend the current session.
                session_hits += 1;
                session_end = tt;

                // On to the next hit ...
                prevHit = tt;
            }

            // Write out the last session.
            if (inSession) {
                collect_session(userid.get(), session_start, session_end, session_hits, output);
            }
        }

        /**
         * Looks up the user's attributes and emits one session record.
         *
         * @throws IOException if the user has no record, a required bin is
         *         missing or non-numeric, or the collector fails
         */
        private void collect_session(long userid, long start, long end, int nhits,
                OutputCollector<Text, Session> output) throws IOException {

            Key kk = new Key(namespace, setName, userid);
            Record rec = client.get(policy, kk);
            if (rec == null) {
                throw new IOException("no Aerospike record for userid " + userid
                        + " in " + namespace + "." + setName);
            }

            int age = intBin(rec, "age", userid);
            int isMale = intBin(rec, "isMale", userid);

            try {
                // Generate a sessionid from the hash of the userid and start.
                MessageDigest md = MessageDigest.getInstance("SHA-256");
                md.update(ByteBuffer.allocate(8).putLong(userid).array());
                md.update(ByteBuffer.allocate(8).putLong(start).array());
                String sessid = Hex.encodeHexString(md.digest()).substring(0, 16);

                Session session = new Session(userid, start, end, nhits, age, isMale);

                output.collect(new Text(sessid), session);
            } catch (NoSuchAlgorithmException ex) {
                throw new RuntimeException(ex);
            }
        }

        /**
         * Reads an integer-valued bin. Any {@link Number} is accepted, so the
         * lookup works whether the client hands integers back as Integer or
         * as Long.
         *
         * @throws IOException if the bin is absent or not numeric
         */
        private static int intBin(Record rec, String name, long userid) throws IOException {
            Object value = (rec.bins == null) ? null : rec.bins.get(name);
            if (!(value instanceof Number)) {
                throw new IOException("bin '" + name + "' missing or not numeric for userid " + userid);
            }
            return ((Number) value).intValue();
        }
    }

    /**
     * Output format that stores every reduced session as one Aerospike
     * record, keyed by the session id.
     */
    public static class SessionOutputFormat extends AerospikeOutputFormat<Text, Session> {

        /**
         * Record writer that copies the fields of a {@code Session} into
         * like-named bins and writes them with a single put.
         */
        public static class SessionRecordWriter extends AerospikeRecordWriter<Text, Session> {

            public SessionRecordWriter(Configuration cfg, Progressable progressable) {
                super(cfg, progressable);
            }

            /**
             * Writes one session under the key (namespace, setName, sessid).
             * The supplied write policy has its timeout set to 10000 before
             * the put is issued.
             */
            @Override
            public void writeAerospike(Text sessid, Session session, AerospikeClient client,
                    WritePolicy writePolicy, String namespace, String setName) throws IOException {
                writePolicy.timeout = 10000;

                final Key recordKey = new Key(namespace, setName, sessid.toString());

                // One bin per Session field, in the field's declaration order.
                final Bin[] sessionBins = {
                    new Bin("userid", session.userid),
                    new Bin("start", session.start),
                    new Bin("end", session.end),
                    new Bin("nhits", session.nhits),
                    new Bin("age", session.age),
                    new Bin("isMale", session.isMale),
                };

                client.put(writePolicy, recordKey, sessionBins);
            }
        }

        /** Supplies the writer that persists sessions for this job. */
        public RecordWriter<Text, Session> getAerospikeRecordWriter(Configuration conf, Progressable prog) {
            RecordWriter<Text, Session> writer = new SessionRecordWriter(conf, prog);
            return writer;
        }
    }

    /**
     * Configures and runs the sessionization job, blocking until it ends.
     *
     * @param args paths of the web-log inputs; each one is added as an input
     *             path of the job
     * @return 0 once the job has run
     * @throws Exception if the job cannot be set up or run
     */
    public int run(final String[] args) throws Exception {

        log.info("run starting");

        final Configuration conf = getConf();
        final JobConf job = new JobConf(conf, ExternalJoin.class);
        job.setJobName("AerospikeExternalJoin");

        // Map side: (userid, timestamp) pairs.
        job.setMapperClass(Map.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(LongWritable.class);

        // Reduce side: (session id, Session) pairs. No combiner is set,
        // because Reduce changes the record format and so cannot be reused
        // as one.
        job.setReducerClass(Reduce.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Session.class);

        job.setOutputFormat(SessionOutputFormat.class);

        // Every command-line argument names one input path.
        for (final String inputPath : args) {
            FileInputFormat.addInputPath(job, new Path(inputPath));
        }

        JobClient.runJob(job);

        log.info("finished");
        return 0;
    }

    /**
     * Command-line entry point: runs the job through {@code ToolRunner} and
     * exits the JVM with the status the tool returned.
     */
    public static void main(final String[] args) throws Exception {
        final int exitCode = ToolRunner.run(new ExternalJoin(), args);
        System.exit(exitCode);
    }
}

// Local Variables:
// mode: java
// c-basic-offset: 4
// tab-width: 4
// indent-tabs-mode: nil
// End:
// vim: softtabstop=4:shiftwidth=4:expandtab