org.pooledtimeseries.healthcheck.CheckCartesianProductSeqFile.java Source code

Java tutorial

Introduction

Here is the source code for org.pooledtimeseries.healthcheck.CheckCartesianProductSeqFile.java

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.pooledtimeseries.healthcheck;

import java.io.IOException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.pooledtimeseries.FeatureVector;
import org.pooledtimeseries.cartesian.CartesianInputFormat;
import org.pooledtimeseries.seqfile.TextVectorsToSequenceFile;
import org.pooledtimeseries.util.PoTSerialiser;
import org.pooledtimeseries.util.ReadSeqFileUtil;

/**
 * Health-check job for sequence files generated by {@link TextVectorsToSequenceFile}.
 * <p>
 * The same input path is configured as both the left and the right side of a
 * {@link CartesianInputFormat}. For every record the mapper prints the key and the size of
 * the {@code List<FeatureVector>} deserialized from the value to standard output, so the
 * task logs can be inspected to confirm that keys and sizes look correct.
 * <p>
 * The job output contains up to two records:
 * <ul>
 * <li>{@code simkey} - number of pairs whose two file names are equal</li>
 * <li>{@code diskey} - number of pairs whose two file names differ</li>
 * </ul>
 */
public class CheckCartesianProductSeqFile {

    /**
     * Logs each pair it receives and emits a count of one under {@code simkey} or
     * {@code diskey}, depending on whether the two file names in the key are equal.
     */
    public static class CartesianMapper extends MapReduceBase
            implements Mapper<Text, BytesWritable, Text, IntWritable> {

        private static final IntWritable ONE = new IntWritable(1);

        private final Text simkey = new Text("simkey");
        private final Text diskey = new Text("diskey");

        /**
         * Prints the key and the number of feature vectors in the value, then emits
         * {@code (simkey, 1)} or {@code (diskey, 1)}.
         *
         * @param key      pair key; file names are extracted with
         *                 {@link ReadSeqFileUtil#getFileNames}
         * @param value    serialized {@code List<FeatureVector>}
         * @param output   collector receiving the per-pair count
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void map(Text key, BytesWritable value, OutputCollector<Text, IntWritable> output, Reporter reporter)
                throws IOException {
            System.out.println(key);
            System.out.println("");

            // getBytes() returns the backing buffer, which may be longer than the record;
            // only the first getLength() bytes are valid data.
            byte[] payload = Arrays.copyOf(value.getBytes(), value.getLength());
            @SuppressWarnings("unchecked") // the sequence file is expected to hold List<FeatureVector>
            List<FeatureVector> vectors = (List<FeatureVector>) PoTSerialiser.getObject(payload);
            System.out.println("Size- " + vectors.size());

            System.out.println();

            // Classify the pair by comparing the two file names carried in the key.
            String[] files = ReadSeqFileUtil.getFileNames(key);
            if (files[0].equals(files[1])) {
                output.collect(simkey, ONE);
            } else {
                output.collect(diskey, ONE);
            }
        }
    }

    /**
     * Sums the counts emitted for each key and writes the total as text.
     */
    public static class CartesianReducer extends MapReduceBase implements Reducer<Text, IntWritable, Text, Text> {

        private final Text outputVal = new Text();

        /**
         * Adds up all values for {@code key} and emits {@code (key, total)}.
         *
         * @param key      {@code simkey} or {@code diskey}
         * @param values   per-pair counts produced by the mapper
         * @param output   collector receiving the total as {@link Text}
         * @param reporter unused
         * @throws IOException if the collector fails
         */
        @Override
        public void reduce(Text key, Iterator<IntWritable> values, OutputCollector<Text, Text> output,
                Reporter reporter) throws IOException {
            // long: the number of pairs grows quadratically and can exceed the int range.
            long sum = 0L;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            outputVal.set(Long.toString(sum));
            output.collect(key, outputVal);
        }

    }

    /**
     * Configures and runs the check job.
     *
     * @param args generic Hadoop options followed by {@code <input sequence file> <out>}
     * @throws IOException if the job cannot be submitted or fails
     * @throws InterruptedException declared for backward compatibility
     * @throws ClassNotFoundException declared for backward compatibility
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

        long start = System.currentTimeMillis();

        // JobConf(String) interprets its argument as a configuration resource path,
        // so the job name has to be set explicitly.
        JobConf conf = new JobConf();
        conf.setJobName("Cartesian Product");

        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: CheckCartesianProductSeqFile <input sequence file> <out>");
            System.exit(1);
        }

        conf.setJarByClass(CheckCartesianProductSeqFile.class);

        conf.setMapperClass(CartesianMapper.class);
        conf.setReducerClass(CartesianReducer.class);

        // Pair the input with itself: the same path feeds both sides of the product.
        conf.setInputFormat(CartesianInputFormat.class);
        CartesianInputFormat.setLeftInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);
        CartesianInputFormat.setRightInputInfo(conf, SequenceFileInputFormat.class, otherArgs[0]);

        TextOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

        // The mapper emits (Text, IntWritable) while the reducer emits (Text, Text),
        // so intermediate and final types are declared separately.
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapOutputValueClass(IntWritable.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        // runJob blocks until the job has finished, so no polling is required afterwards.
        RunningJob job = JobClient.runJob(conf);

        long finish = System.currentTimeMillis();

        System.out.println("Time in ms: " + (finish - start));

        System.exit(job.isSuccessful() ? 0 : 2);
    }

}