Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setInt.

Prototype

public void setInt(String name, int value) 

Source Link

Document

Set the value of the name property to an int.

Usage

From source file:DistribCountingDriver.java

License:Apache License

public int run(String args[]) throws Exception {
    long job_start_time, job_end_time;
    long job_runtime;

    JobConf conf = new JobConf(getConf());

    int minFreqPercent = Integer.parseInt(args[0]);
    int datasetSize = Integer.parseInt(args[1]);
    conf.setInt("DISTRCOUNT.datasetSize", datasetSize);
    conf.setInt("DISTRCOUNT.minFreqPercent", minFreqPercent);

    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setJarByClass(DistribCountingDriver.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(DistribCountingMapper.class);
    conf.setCombinerClass(DistribCountingCombiner.class);
    conf.setReducerClass(DistribCountingReducer.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(conf, new Path(args[2]));
    FileOutputFormat.setOutputPath(conf, new Path(args[3]));

    job_start_time = System.currentTimeMillis();
    JobClient.runJob(conf);/*from  w w  w  .jav  a2 s  . co  m*/
    job_end_time = System.currentTimeMillis();

    job_runtime = (job_end_time - job_start_time) / 1000;

    System.out.println("total job runtime (seconds): " + job_runtime);

    return 0;
}

From source file:MRDriver.java

License:Apache License

public int run(String args[]) throws Exception {
    FileSystem fs = null;/*w w w  .  j a  va  2 s . c o  m*/
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    /*
     * Compute the number of required "votes" for an itemsets to be
     * declared frequent    
     */
    // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    conf.setNumReduceTasks(numSamples);

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    /* 
     * Enable compression of map output.
     *
     * We do it for this job and not for the aggregation one because
     * each mapper there only print out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     *
     * In Amazon MapReduce compression of the map output seems to be
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
     */
    conf.setBoolean("mapred.compress.map.output", true);
    //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

    conf.setJarByClass(MRDriver.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    // We write the collections found in a reducers as a SequenceFile 
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(PartitionMapper.class);
        break;
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(BinomialSamplerMapper.class);
        break;
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(CoinFlipSamplerMapper.class);
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(InputSamplerMapper.class);

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);
        }

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
            else
                sampleIDs = new ArrayList<IntWritable>();
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);
        }

        /*
         * Convert the Hastable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
         */
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);
        }

        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        map.write(out);
        out.sync();
        out.close();
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
                conf);
        // stop the sampling timer   
        long sampling_end_time = System.nanoTime();
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        conf.setInputFormat(WholeSplitInputFormat.class);
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        conf.setMapperClass(RandIntPartSamplerMapper.class);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        conf.setNumMapTasks(mapTasksNum);
        //System.out.println("len: " + len + " blockSize: " 
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                    System.exit(1);
                }
                if (sum == sampleSize) {
                    break;
                }
            }
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
                }
            }
            Collections.shuffle(Arrays.asList(tempToSampleArr));
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];
            }
        }

        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        }
        break;
    default:
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");
        System.exit(1);
        break;
    }

    /*
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.
     */
    conf.setPartitionerClass(FIMPartitioner.class);

    conf.setReducerClass(FIMReducer.class);

    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    confAggr.setJarByClass(MRDriver.class);

    confAggr.setMapOutputKeyClass(Text.class);
    confAggr.setMapOutputValueClass(DoubleWritable.class);

    confAggr.setOutputKeyClass(Text.class);
    confAggr.setOutputValueClass(Text.class);

    confAggr.setMapperClass(AggregateMapper.class);
    confAggr.setReducerClass(AggregateReducer.class);

    confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    if (args[7].equals("4")) {
        // Remove samplesMap file 
        fs.delete(samplesMapPath, false);
    }

    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();
    }

    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();
    }

    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
        }
    }
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
        }
    }
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    }
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        }
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
        }
    }
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
        }
    }
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
        }
    }
    System.out
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    }
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        }
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
        }
    }
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
        }
    }
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
        }
    }
    System.out.println(
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    }
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        }
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
        }
    }
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
        }
    }
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
        }
    }
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    }
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        }
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
        }
    }
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    System.out.println(
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

    System.out
            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
    System.out.println(
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;
}

From source file:SleepJob.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJob.class);
    job.setNumMapTasks(numMapper);//www .j a v a 2  s.co m
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJob.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJob.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJob.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:NaivePageRank.java

License:Apache License

public static void main(String[] args) throws Exception {
    int iteration = -1;
    String inputPath = args[0];/*from w w  w. j av  a  2  s .  co  m*/
    String outputPath = args[1];
    int specIteration = 0;
    if (args.length > 2) {
        specIteration = Integer.parseInt(args[2]);
    }
    int numNodes = 100000;
    if (args.length > 3) {
        numNodes = Integer.parseInt(args[3]);
    }
    int numReducers = 32;
    if (args.length > 4) {
        numReducers = Integer.parseInt(args[4]);
    }
    System.out.println("specified iteration: " + specIteration);
    long start = System.currentTimeMillis();

    /**
     * job to count out-going links for each url
     */
    JobConf conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Count");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(CountMapper.class);
    conf.setReducerClass(CountReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/count"));
    conf.setNumReduceTasks(numReducers);
    JobClient.runJob(conf);

    /******************** Initial Rank Assignment Job ***********************/
    conf = new JobConf(NaivePageRank.class);
    conf.setJobName("PageRank-Initialize");
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapperClass(InitialRankAssignmentMapper.class);
    conf.setReducerClass(InitialRankAssignmentReducer.class);
    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
    conf.setNumReduceTasks(numReducers);
    // conf.setIterative(false);
    JobClient.runJob(conf);
    iteration++;

    do {
        /****************** Join Job ********************************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Join");
        conf.setOutputKeyClass(Text.class);
        // conf.setOutputValueClass(Text.class);
        conf.setMapperClass(ComputeRankMap.class);
        conf.setReducerClass(ComputeRankReduce.class);
        conf.setMapOutputKeyClass(TextPair.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setPartitionerClass(FirstPartitioner.class);
        conf.setOutputKeyComparatorClass(KeyComparator.class);
        conf.setOutputValueGroupingComparator(GroupComparator.class);

        // relation table
        FileInputFormat.setInputPaths(conf, new Path(inputPath));
        // rank table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/i" + (iteration - 1)));
        // count table
        FileInputFormat.addInputPath(conf, new Path(outputPath + "/count"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        JobClient.runJob(conf);
        iteration++;

        /******************** Rank Aggregate Job ***********************/
        conf = new JobConf(NaivePageRank.class);
        conf.setJobName("PageRank-Aggregate");
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);
        conf.setMapOutputKeyClass(Text.class);
        conf.setMapperClass(RankAggregateMapper.class);
        conf.setReducerClass(RankAggregateReducer.class);
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(outputPath + "/i" + (iteration - 1)));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath + "/i" + iteration));
        conf.setNumReduceTasks(numReducers);
        conf.setInt("haloop.num.nodes", numNodes);
        JobClient.runJob(conf);
        iteration++;
    } while (iteration < 2 * specIteration);

    long end = System.currentTimeMillis();
    System.out.println("running time " + (end - start) / 1000 + "s");
}

From source file:RepackWikipedia.java

License:Apache License

@SuppressWarnings("static-access")
@Override//from   w  w w . j av  a2  s.  com
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output location")
            .create(OUTPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("mapping file")
            .create(MAPPING_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("block|record|none").hasArg()
            .withDescription("compression type").create(COMPRESSION_TYPE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de").hasArg().withDescription("two-letter language code")
            .create(LANGUAGE_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(MAPPING_FILE_OPTION) || !cmdline.hasOption(COMPRESSION_TYPE_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(OUTPUT_OPTION);
    String mappingFile = cmdline.getOptionValue(MAPPING_FILE_OPTION);
    String compressionType = cmdline.getOptionValue(COMPRESSION_TYPE_OPTION);

    if (!"block".equals(compressionType) && !"record".equals(compressionType)
            && !"none".equals(compressionType)) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        return -1;
    }

    String language = null;
    if (cmdline.hasOption(LANGUAGE_OPTION)) {
        language = cmdline.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    // this is the default block size
    int blocksize = 1000000;

    //Job job = Job.getInstance(getConf());
    JobConf conf = new JobConf(RepackWikipedia.class);
    conf.setJarByClass(RepackWikipedia.class);
    conf.setJobName(String.format("RepackWikipedia[%s: %s, %s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_OPTION, outputPath, COMPRESSION_TYPE_OPTION, compressionType, LANGUAGE_OPTION, language));

    conf.set(DOCNO_MAPPING_FIELD, mappingFile);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - XML dump file: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - docno mapping data file: " + mappingFile);
    LOG.info(" - compression type: " + compressionType);
    LOG.info(" - language: " + language);

    if ("block".equals(compressionType)) {
        LOG.info(" - block size: " + blocksize);
    }

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPath(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if ("none".equals(compressionType)) {
        FileOutputFormat.setCompressOutput(conf, false);
    } else {
        FileOutputFormat.setCompressOutput(conf, true);

        if ("record".equals(compressionType)) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    if (language != null) {
        conf.set("wiki.language", language);
    }

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WikipediaPage.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(getConf()).delete(new Path(outputPath), true);

    //job.waitForCompletion(true);
    JobClient.runJob(conf);

    return 0;
}

From source file:SleepJobWithArray.java

License:Apache License

public JobConf setupJobConf(int numMapper, int numReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount) {
    JobConf job = new JobConf(getConf(), SleepJobWithArray.class);
    job.setNumMapTasks(numMapper);//from w  w  w  .j  av a2s .com
    job.setNumReduceTasks(numReducer);
    job.setMapperClass(SleepJobWithArray.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setReducerClass(SleepJobWithArray.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setInputFormat(SleepInputFormat.class);
    job.setPartitionerClass(SleepJobWithArray.class);
    job.setSpeculativeExecution(false);
    FileInputFormat.addInputPath(job, new Path("ignored"));
    job.setLong("sleep.job.map.sleep.time", mapSleepTime);
    job.setLong("sleep.job.reduce.sleep.time", reduceSleepTime);
    job.setInt("sleep.job.map.sleep.count", mapSleepCount);
    job.setInt("sleep.job.reduce.sleep.count", reduceSleepCount);
    return job;
}

From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.cpu.MatrixMultiplicationCpu.java

License:Apache License

public static Configuration createMatrixMultiplicationCpuConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality, boolean isDebugging) {

    JobConf conf = new JobConf(initialConf, MatrixMultiplicationCpu.class);
    conf.setJobName("MatrixMultiplicationCPU: " + aPath + " x " + bPath + " = " + outPath);

    conf.setInt(CONF_OUT_CARD, outCardinality);
    conf.setBoolean(CONF_DEBUG, isDebugging);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(MatrixMultiplyCpuMapper.class);
    conf.setCombinerClass(MatrixMultiplicationCpuReducer.class);
    conf.setReducerClass(MatrixMultiplicationCpuReducer.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    // Increase client heap size
    conf.set("mapred.child.java.opts", "-Xms8G -Xmx8G");

    return conf;/*from  ww w . j  a  va 2  s.com*/
}

From source file:at.illecker.hadoop.rootbeer.examples.matrixmultiplication.gpu.MatrixMultiplicationGpu.java

License:Apache License

public static Configuration createMatrixMultiplicationGpuConf(Configuration initialConf, Path aPath, Path bPath,
        Path outPath, int outCardinality, int tileWidth, boolean isDebugging) {

    JobConf conf = new JobConf(initialConf, MatrixMultiplicationGpu.class);
    conf.setJobName("MatrixMultiplicationGPU: " + aPath + " x " + bPath + " = " + outPath);

    conf.setInt(CONF_OUT_CARD, outCardinality);
    conf.setInt(CONF_TILE_WIDTH, tileWidth);
    conf.setBoolean(CONF_DEBUG, isDebugging);

    conf.setInputFormat(CompositeInputFormat.class);
    conf.set("mapred.join.expr",
            CompositeInputFormat.compose("inner", SequenceFileInputFormat.class, aPath, bPath));

    conf.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(MatrixMultiplyGpuMapper.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(VectorWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(VectorWritable.class);

    // Increase client heap size for GPU Rootbeer execution
    conf.set("mapred.child.java.opts", "-Xms8G -Xmx8G");

    // No Reduce step is needed
    // -> 0 reducer means reduce step will be skipped and
    // mapper output will be the final out
    // -> Identity reducer means then shuffling/sorting will still take place
    conf.setNumReduceTasks(0);/*from ww w  .j a  v  a2 s  . c o  m*/

    return conf;
}

From source file:bixo.examples.webmining.DemoWebMiningWorkflow.java

License:Apache License

public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, DemoWebMiningOptions options) throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(DemoWebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }//from   w  w w . j a v  a2  s . c o m

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster, 
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries 
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status, 
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}

From source file:bixo.examples.webmining.WebMiningWorkflow.java

License:Apache License

public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, WebMiningOptions options, boolean resetSolr)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = 1; // we always want to use a single reducer, to avoid contention
    conf.setNumReduceTasks(numReducers);
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(WebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }/*from  w w w  . j  a v  a 2s.c  om*/

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster, 
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries 
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Group the finished datums, the skipped datums, status, outlinks
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status, 
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}