Example usage for org.apache.hadoop.mapred JobConf setFloat

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setFloat.

Prototype

public void setFloat(String name, float value)

Source Link

Document

Set the value of the name property to a float.

Usage

From source file:MRDriver.java

License:Apache License

public int run(String args[]) throws Exception {
    FileSystem fs = null;/*from  w w  w  . j a  v  a  2s.c  o  m*/
    Path samplesMapPath = null;

    float epsilon = Float.parseFloat(args[0]);
    double delta = Double.parseDouble(args[1]);
    int minFreqPercent = Integer.parseInt(args[2]);
    int d = Integer.parseInt(args[3]);
    int datasetSize = Integer.parseInt(args[4]);
    int numSamples = Integer.parseInt(args[5]);
    double phi = Double.parseDouble(args[6]);
    Random rand;

    /************************ Job 1 (local FIM) Configuration ************************/

    JobConf conf = new JobConf(getConf());

    /*
     * Compute the number of required "votes" for an itemsets to be
     * declared frequent    
     */
    // The +1 at the end is needed to ensure reqApproxNum > numsamples / 2.
    int reqApproxNum = (int) Math
            .floor((numSamples * (1 - phi)) - Math.sqrt(numSamples * (1 - phi) * 2 * Math.log(1 / delta))) + 1;
    int sampleSize = (int) Math.ceil((2 / Math.pow(epsilon, 2)) * (d + Math.log(1 / phi)));
    //System.out.println("reducersNum: " + numSamples + " reqApproxNum: " + reqApproxNum);

    conf.setInt("PARMM.reducersNum", numSamples);
    conf.setInt("PARMM.datasetSize", datasetSize);
    conf.setInt("PARMM.minFreqPercent", minFreqPercent);
    conf.setInt("PARMM.sampleSize", sampleSize);
    conf.setFloat("PARMM.epsilon", epsilon);

    // Set the number of reducers equal to the number of samples, to
    // maximize parallelism. Required by our Partitioner.
    conf.setNumReduceTasks(numSamples);

    // XXX: why do we disable the speculative execution? MR
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    /* 
     * Enable compression of map output.
     *
     * We do it for this job and not for the aggregation one because
     * each mapper there only print out one record for each itemset,
     * so there isn't much to compress, I'd say. MR
     *
     * In Amazon MapReduce compression of the map output seems to be
     * happen by default and the Snappy codec is used, which is
     * extremely fast.
     */
    conf.setBoolean("mapred.compress.map.output", true);
    //conf.setMapOutputCompressorClass(com.hadoop.compression.lzo.LzoCodec.class);

    conf.setJarByClass(MRDriver.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    // We write the collections found in a reducers as a SequenceFile 
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(args[9]));

    // set the mapper class based on command line option
    switch (Integer.parseInt(args[7])) {
    case 1:
        System.out.println("running partition mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(PartitionMapper.class);
        break;
    case 2:
        System.out.println("running binomial mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(BinomialSamplerMapper.class);
        break;
    case 3:
        System.out.println("running coin mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(CoinFlipSamplerMapper.class);
    case 4:
        System.out.println("running sampler mapper...");
        SequenceFileInputFormat.addInputPath(conf, new Path(args[8]));
        conf.setMapperClass(InputSamplerMapper.class);

        // create a random sample of size T*m
        rand = new Random();
        long sampling_start_time = System.nanoTime();
        int[] samples = new int[numSamples * sampleSize];
        for (int i = 0; i < numSamples * sampleSize; i++) {
            samples[i] = rand.nextInt(datasetSize);
        }

        // for each key in the sample, create a list of all T samples to which this key belongs
        Hashtable<LongWritable, ArrayList<IntWritable>> hashTable = new Hashtable<LongWritable, ArrayList<IntWritable>>();
        for (int i = 0; i < numSamples * sampleSize; i++) {
            ArrayList<IntWritable> sampleIDs = null;
            LongWritable key = new LongWritable(samples[i]);
            if (hashTable.containsKey(key))
                sampleIDs = hashTable.get(key);
            else
                sampleIDs = new ArrayList<IntWritable>();
            sampleIDs.add(new IntWritable(i % numSamples));
            hashTable.put(key, sampleIDs);
        }

        /*
         * Convert the Hastable to a MapWritable which we will
         * write to HDFS and distribute to all Mappers using
         * DistributedCache
         */
        MapWritable map = new MapWritable();
        for (LongWritable key : hashTable.keySet()) {
            ArrayList<IntWritable> sampleIDs = hashTable.get(key);
            IntArrayWritable sampleIDsIAW = new IntArrayWritable();
            sampleIDsIAW.set(sampleIDs.toArray(new IntWritable[sampleIDs.size()]));
            map.put(key, sampleIDsIAW);
        }

        fs = FileSystem.get(URI.create("samplesMap.ser"), conf);
        samplesMapPath = new Path("samplesMap.ser");
        FSDataOutputStream out = fs.create(samplesMapPath, true);
        map.write(out);
        out.sync();
        out.close();
        DistributedCache.addCacheFile(new URI(fs.getWorkingDirectory() + "/samplesMap.ser#samplesMap.ser"),
                conf);
        // stop the sampling timer   
        long sampling_end_time = System.nanoTime();
        long sampling_runtime = (sampling_end_time - sampling_start_time) / 1000000;
        System.out.println("sampling runtime (milliseconds): " + sampling_runtime);
        break; // end switch case
    case 5:
        System.out.println("running random integer partition mapper...");
        conf.setInputFormat(WholeSplitInputFormat.class);
        Path inputFilePath = new Path(args[8]);
        WholeSplitInputFormat.addInputPath(conf, inputFilePath);
        conf.setMapperClass(RandIntPartSamplerMapper.class);
        // Compute number of map tasks.
        fs = inputFilePath.getFileSystem(conf);
        FileStatus inputFileStatus = fs.getFileStatus(inputFilePath);
        long len = inputFileStatus.getLen();
        long blockSize = inputFileStatus.getBlockSize();
        conf.setLong("mapred.min.split.size", blockSize);
        conf.setLong("mapred.max.split.size", blockSize);
        int mapTasksNum = ((int) (len / blockSize)) + 1;
        conf.setNumMapTasks(mapTasksNum);
        //System.out.println("len: " + len + " blockSize: " 
        //      + blockSize + " mapTasksNum: " + mapTasksNum);
        // Extract random integer partition of total sample
        // size into up to mapTasksNum partitions.
        // XXX I'm not sure this is a correct way to do
        // it.
        rand = new Random();
        IntWritable[][] toSampleArr = new IntWritable[mapTasksNum][numSamples];
        for (int j = 0; j < numSamples; j++) {
            IntWritable[] tempToSampleArr = new IntWritable[mapTasksNum];
            int sum = 0;
            int i;
            for (i = 0; i < mapTasksNum - 1; i++) {
                int size = rand.nextInt(sampleSize - sum);
                tempToSampleArr[i] = new IntWritable(size);
                sum += size;
                if (sum > numSamples * sampleSize) {
                    System.out.println("Something went wrong generating the sample Sizes");
                    System.exit(1);
                }
                if (sum == sampleSize) {
                    break;
                }
            }
            if (i == mapTasksNum - 1) {
                tempToSampleArr[i] = new IntWritable(sampleSize - sum);
            } else {
                for (; i < mapTasksNum; i++) {
                    tempToSampleArr[i] = new IntWritable(0);
                }
            }
            Collections.shuffle(Arrays.asList(tempToSampleArr));
            for (i = 0; i < mapTasksNum; i++) {
                toSampleArr[i][j] = tempToSampleArr[i];
            }
        }

        for (int i = 0; i < mapTasksNum; i++) {
            DefaultStringifier.storeArray(conf, toSampleArr[i], "PARMM.toSampleArr_" + i);
        }
        break;
    default:
        System.err.println("Wrong Mapper ID. Can only be in [1,5]");
        System.exit(1);
        break;
    }

    /*
     * We don't use the default hash partitioner because we want to
     * maximize the parallelism. That's why we also fix the number
     * of reducers.
     */
    conf.setPartitionerClass(FIMPartitioner.class);

    conf.setReducerClass(FIMReducer.class);

    /************************ Job 2 (aggregation) Configuration ************************/

    JobConf confAggr = new JobConf(getConf());

    confAggr.setInt("PARMM.reducersNum", numSamples);
    confAggr.setInt("PARMM.reqApproxNum", reqApproxNum);
    confAggr.setInt("PARMM.sampleSize", sampleSize);
    confAggr.setFloat("PARMM.epsilon", epsilon);

    // XXX: Why do we disable speculative execution? MR
    confAggr.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    confAggr.setInt("mapred.task.timeout", MR_TIMEOUT_MILLI);

    confAggr.setJarByClass(MRDriver.class);

    confAggr.setMapOutputKeyClass(Text.class);
    confAggr.setMapOutputValueClass(DoubleWritable.class);

    confAggr.setOutputKeyClass(Text.class);
    confAggr.setOutputValueClass(Text.class);

    confAggr.setMapperClass(AggregateMapper.class);
    confAggr.setReducerClass(AggregateReducer.class);

    confAggr.setInputFormat(CombineSequenceFileInputFormat.class);
    SequenceFileInputFormat.addInputPath(confAggr, new Path(args[9]));

    FileOutputFormat.setOutputPath(confAggr, new Path(args[10]));

    long FIMjob_start_time = System.currentTimeMillis();
    RunningJob FIMjob = JobClient.runJob(conf);
    long FIMjob_end_time = System.currentTimeMillis();

    RunningJob aggregateJob = JobClient.runJob(confAggr);
    long aggrJob_end_time = System.currentTimeMillis();

    long FIMjob_runtime = FIMjob_end_time - FIMjob_start_time;

    long aggrJob_runtime = aggrJob_end_time - FIMjob_end_time;

    if (args[7].equals("4")) {
        // Remove samplesMap file 
        fs.delete(samplesMapPath, false);
    }

    Counters counters = FIMjob.getCounters();
    Counters.Group FIMMapperStartTimesCounters = counters.getGroup("FIMMapperStart");
    long[] FIMMapperStartTimes = new long[FIMMapperStartTimesCounters.size()];
    int i = 0;
    for (Counters.Counter counter : FIMMapperStartTimesCounters) {
        FIMMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMMapperEndTimesCounters = counters.getGroup("FIMMapperEnd");
    long[] FIMMapperEndTimes = new long[FIMMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMMapperEndTimesCounters) {
        FIMMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerStartTimesCounters = counters.getGroup("FIMReducerStart");
    long[] FIMReducerStartTimes = new long[FIMReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerStartTimesCounters) {
        FIMReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group FIMReducerEndTimesCounters = counters.getGroup("FIMReducerEnd");
    long[] FIMReducerEndTimes = new long[FIMReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : FIMReducerEndTimesCounters) {
        FIMReducerEndTimes[i++] = counter.getCounter();
    }

    Counters countersAggr = aggregateJob.getCounters();
    Counters.Group AggregateMapperStartTimesCounters = countersAggr.getGroup("AggregateMapperStart");
    long[] AggregateMapperStartTimes = new long[AggregateMapperStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperStartTimesCounters) {
        AggregateMapperStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateMapperEndTimesCounters = countersAggr.getGroup("AggregateMapperEnd");
    long[] AggregateMapperEndTimes = new long[AggregateMapperEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateMapperEndTimesCounters) {
        AggregateMapperEndTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerStartTimesCounters = countersAggr.getGroup("AggregateReducerStart");
    long[] AggregateReducerStartTimes = new long[AggregateReducerStartTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerStartTimesCounters) {
        AggregateReducerStartTimes[i++] = counter.getCounter();
    }

    Counters.Group AggregateReducerEndTimesCounters = countersAggr.getGroup("AggregateReducerEnd");
    long[] AggregateReducerEndTimes = new long[AggregateReducerEndTimesCounters.size()];
    i = 0;
    for (Counters.Counter counter : AggregateReducerEndTimesCounters) {
        AggregateReducerEndTimes[i++] = counter.getCounter();
    }

    long FIMMapperStartMin = FIMMapperStartTimes[0];
    for (long l : FIMMapperStartTimes) {
        if (l < FIMMapperStartMin) {
            FIMMapperStartMin = l;
        }
    }
    long FIMMapperEndMax = FIMMapperEndTimes[0];
    for (long l : FIMMapperEndTimes) {
        if (l > FIMMapperEndMax) {
            FIMMapperEndMax = l;
        }
    }
    System.out.println("FIM job setup time (milliseconds): " + (FIMMapperStartMin - FIMjob_start_time));
    System.out.println("FIMMapper total runtime (milliseconds): " + (FIMMapperEndMax - FIMMapperStartMin));
    long[] FIMMapperRunTimes = new long[FIMMapperStartTimes.length];
    long FIMMapperRunTimesSum = 0;
    for (int l = 0; l < FIMMapperStartTimes.length; l++) {
        FIMMapperRunTimes[l] = FIMMapperEndTimes[l] - FIMMapperStartTimes[l];
        FIMMapperRunTimesSum += FIMMapperRunTimes[l];
    }
    System.out.println("FIMMapper average task runtime (milliseconds): "
            + FIMMapperRunTimesSum / FIMMapperStartTimes.length);
    long FIMMapperRunTimesMin = FIMMapperRunTimes[0];
    long FIMMapperRunTimesMax = FIMMapperRunTimes[0];
    for (long l : FIMMapperRunTimes) {
        if (l < FIMMapperRunTimesMin) {
            FIMMapperRunTimesMin = l;
        }
        if (l > FIMMapperRunTimesMax) {
            FIMMapperRunTimesMax = l;
        }
    }
    System.out.println("FIMMapper minimum task runtime (milliseconds): " + FIMMapperRunTimesMin);
    System.out.println("FIMMapper maximum task runtime (milliseconds): " + FIMMapperRunTimesMax);

    long FIMReducerStartMin = FIMReducerStartTimes[0];
    for (long l : FIMReducerStartTimes) {
        if (l < FIMReducerStartMin) {
            FIMReducerStartMin = l;
        }
    }
    long FIMReducerEndMax = FIMReducerEndTimes[0];
    for (long l : FIMReducerEndTimes) {
        if (l > FIMReducerEndMax) {
            FIMReducerEndMax = l;
        }
    }
    System.out
            .println("FIM job shuffle phase runtime (milliseconds): " + (FIMReducerStartMin - FIMMapperEndMax));
    System.out.println("FIMReducer total runtime (milliseconds): " + (FIMReducerEndMax - FIMReducerStartMin));
    long[] FIMReducerRunTimes = new long[FIMReducerStartTimes.length];
    long FIMReducerRunTimesSum = 0;
    for (int l = 0; l < FIMReducerStartTimes.length; l++) {
        FIMReducerRunTimes[l] = FIMReducerEndTimes[l] - FIMReducerStartTimes[l];
        FIMReducerRunTimesSum += FIMReducerRunTimes[l];
    }
    System.out.println("FIMReducer average task runtime (milliseconds): "
            + FIMReducerRunTimesSum / FIMReducerStartTimes.length);
    long FIMReducerRunTimesMin = FIMReducerRunTimes[0];
    long FIMReducerRunTimesMax = FIMReducerRunTimes[0];
    for (long l : FIMReducerRunTimes) {
        if (l < FIMReducerRunTimesMin) {
            FIMReducerRunTimesMin = l;
        }
        if (l > FIMReducerRunTimesMax) {
            FIMReducerRunTimesMax = l;
        }
    }
    System.out.println("FIMReducer minimum task runtime (milliseconds): " + FIMReducerRunTimesMin);
    System.out.println("FIMReducer maximum task runtime (milliseconds): " + FIMReducerRunTimesMax);
    System.out.println("FIM job cooldown time (milliseconds): " + (FIMjob_end_time - FIMReducerEndMax));

    long AggregateMapperStartMin = AggregateMapperStartTimes[0];
    for (long l : AggregateMapperStartTimes) {
        if (l < AggregateMapperStartMin) {
            AggregateMapperStartMin = l;
        }
    }
    long AggregateMapperEndMax = AggregateMapperEndTimes[0];
    for (long l : AggregateMapperEndTimes) {
        if (l > AggregateMapperEndMax) {
            AggregateMapperEndMax = l;
        }
    }
    System.out.println(
            "Aggregation job setup time (milliseconds): " + (AggregateMapperStartMin - FIMjob_end_time));
    System.out.println("AggregateMapper total runtime (milliseconds): "
            + (AggregateMapperEndMax - AggregateMapperStartMin));
    long[] AggregateMapperRunTimes = new long[AggregateMapperStartTimes.length];
    long AggregateMapperRunTimesSum = 0;
    for (int l = 0; l < AggregateMapperStartTimes.length; l++) {
        AggregateMapperRunTimes[l] = AggregateMapperEndTimes[l] - AggregateMapperStartTimes[l];
        AggregateMapperRunTimesSum += AggregateMapperRunTimes[l];
    }
    System.out.println("AggregateMapper average task runtime (milliseconds): "
            + AggregateMapperRunTimesSum / AggregateMapperStartTimes.length);
    long AggregateMapperRunTimesMin = AggregateMapperRunTimes[0];
    long AggregateMapperRunTimesMax = AggregateMapperRunTimes[0];
    for (long l : AggregateMapperRunTimes) {
        if (l < AggregateMapperRunTimesMin) {
            AggregateMapperRunTimesMin = l;
        }
        if (l > AggregateMapperRunTimesMax) {
            AggregateMapperRunTimesMax = l;
        }
    }
    System.out.println("AggregateMapper minimum task runtime (milliseconds): " + AggregateMapperRunTimesMin);
    System.out.println("AggregateMapper maximum task runtime (milliseconds): " + AggregateMapperRunTimesMax);

    long AggregateReducerStartMin = AggregateReducerStartTimes[0];
    for (long l : AggregateReducerStartTimes) {
        if (l < AggregateReducerStartMin) {
            AggregateReducerStartMin = l;
        }
    }
    long AggregateReducerEndMax = AggregateReducerEndTimes[0];
    for (long l : AggregateReducerEndTimes) {
        if (l > AggregateReducerEndMax) {
            AggregateReducerEndMax = l;
        }
    }
    System.out.println("Aggregate job round shuffle phase runtime (milliseconds): "
            + (AggregateReducerStartMin - AggregateMapperEndMax));
    System.out.println("AggregateReducer total runtime (milliseconds): "
            + (AggregateReducerEndMax - AggregateReducerStartMin));
    long[] AggregateReducerRunTimes = new long[AggregateReducerStartTimes.length];
    long AggregateReducerRunTimesSum = 0;
    for (int l = 0; l < AggregateReducerStartTimes.length; l++) {
        AggregateReducerRunTimes[l] = AggregateReducerEndTimes[l] - AggregateReducerStartTimes[l];
        AggregateReducerRunTimesSum += AggregateReducerRunTimes[l];
    }
    System.out.println("AggregateReducer average task runtime (milliseconds): "
            + AggregateReducerRunTimesSum / AggregateReducerStartTimes.length);
    long AggregateReducerRunTimesMin = AggregateReducerRunTimes[0];
    long AggregateReducerRunTimesMax = AggregateReducerRunTimes[0];
    for (long l : AggregateReducerRunTimes) {
        if (l < AggregateReducerRunTimesMin) {
            AggregateReducerRunTimesMin = l;
        }
        if (l > AggregateReducerRunTimesMax) {
            AggregateReducerRunTimesMax = l;
        }
    }
    System.out.println("AggregateReducer minimum task runtime (milliseconds): " + AggregateReducerRunTimesMin);
    System.out.println("AggregateReducer maximum task runtime (milliseconds): " + AggregateReducerRunTimesMax);

    System.out.println(
            "Aggregation job cooldown time (milliseconds): " + (aggrJob_end_time - AggregateReducerEndMax));

    System.out
            .println("total runtime (all inclusive) (milliseconds): " + (aggrJob_end_time - FIMjob_start_time));
    System.out.println("total runtime (no FIM job setup, no aggregation job cooldown) (milliseconds): "
            + (AggregateReducerEndMax - FIMMapperStartMin));
    System.out.println("total runtime (no setups, no cooldowns) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin + AggregateReducerEndMax - AggregateMapperStartMin));
    System.out.println("FIM job runtime (including setup and cooldown) (milliseconds): " + FIMjob_runtime);
    System.out.println("FIM job runtime (no setup, no cooldown) (milliseconds): "
            + (FIMReducerEndMax - FIMMapperStartMin));
    System.out.println(
            "Aggregation job runtime (including setup and cooldown) (milliseconds): " + aggrJob_runtime);
    System.out.println("Aggregation job runtime (no setup, no cooldown) (milliseconds): "
            + (AggregateReducerEndMax - AggregateMapperStartMin));

    return 0;
}

From source file:Brush.BrushConfig.java

License:Apache License

public static void initializeConfiguration(JobConf conf) {
    validateConfiguration();//ww  w .j  ava  2 s  .  co m

    conf.setNumMapTasks(HADOOP_MAPPERS);
    conf.setNumReduceTasks(HADOOP_REDUCERS);
    conf.set("mapred.child.java.opts", HADOOP_JAVAOPTS);
    conf.set("mapred.task.timeout", Long.toString(HADOOP_TIMEOUT));
    conf.setLong("LOCALNODES", HADOOP_LOCALNODES);

    conf.setLong("UP_KMER", UP_KMER);
    conf.setLong("LOW_KMER", LOW_KMER);
    conf.setLong("K", K);
    //conf.setFloat("ERRORRATE", ERRORRATE);
    conf.setFloat("MAJORITY", MAJORITY);
    conf.setFloat("PWM_N", PWM_N);
    conf.setFloat("EXPCOV", EXPCOV);
    conf.setFloat("KMERCOV", KMERCOV);
    conf.setLong("READLENGTH", READLEN);
    conf.setLong("TIPLENGTH", TIPLENGTH);
    conf.setLong("INSLENGTH", INSLEN);
    conf.setLong("INSLENGTH_SD", INSLEN_SD);
    conf.setLong("MAXBUBBLELEN", MAXBUBBLELEN);
    conf.setFloat("BUBBLEEDITRATE", BUBBLEEDITRATE);
    conf.setFloat("LOW_COV_THRESH", LOW_COV_THRESH);
    conf.setLong("MAX_LOW_COV_LEN", MAX_LOW_COV_LEN);
    //conf.setFloat("ERRORRATE", ERRORRATE);

    conf.setLong("N50_TARGET", N50_TARGET);
}

From source file:com.linkedin.mlease.regression.jobs.ItemModelTest.java

License:Open Source License

@Override
public void run() throws Exception {
    JobConfig props = super.getJobConfig();
    List<String> lambdastr = props.getStringList(LAMBDA, ",");
    String outBasePath = props.getString(OUTPUT_BASE_PATH);
    for (String lambda : lambdastr) {
        String outPath = outBasePath + "/lambda-" + lambda;
        props.put("output.path", outPath);
        JobConf conf = createJobConf(PerItemTestMapper.class, PerItemTestReducer.class);
        AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(props.get(MODEL_PATH)), MODEL_PATH);
        conf.set(ITEM_KEY, props.getString(ITEM_KEY));
        conf.setFloat(LAMBDA, Float.parseFloat(lambda));
        conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
        conf.setPartitionerClass(PerItemTestPartitioner.class);
        conf.setInt(NUM_REDUCERS, conf.getNumReduceTasks());
        AvroUtils.runAvroJob(conf);/*  w  w w.ja v  a  2  s.  c o m*/
    }
}

From source file:com.linkedin.mlease.regression.jobs.ItemModelTrain.java

License:Open Source License

@Override
public void run() throws Exception {
    JobConfig props = super.getJobConfig();
    _logger.info("Start training per-key naive logistic regression model...");
    String outBasePath = props.getString(OUTPUT_MODEL_PATH);
    String outpath = outBasePath + "/models";
    props.put("output.path", outpath);
    JobConf conf = createJobConf(ItemModelTrainMapper.class, ItemModelTrainReducer.class,
            Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$),
            LinearModelWithVarAvro.SCHEMA$);
    // set up conf
    String interceptPriorMeanMap = props.getString(INTERCEPT_PRIOR_MEAN_MAP, "");
    if (!interceptPriorMeanMap.equals("")) {
        AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(interceptPriorMeanMap),
                INTERCEPT_PRIOR_MEAN_MAP);
    }//  w w  w . ja  v  a  2s  .  c  o  m
    String lambdaMap = props.getString(LAMBDA_MAP, "");
    if (!lambdaMap.equals("")) {
        AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(lambdaMap), LAMBDA_MAP);
    }
    conf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN, (float) props.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN, 0));
    conf.set(INTERCEPT_LAMBDAS, props.get(INTERCEPT_LAMBDAS));
    conf.set(DEFAULT_LAMBDAS, props.get(DEFAULT_LAMBDAS));
    conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
    conf.setFloat(LIBLINEAR_EPSILON, (float) props.getDouble(LIBLINEAR_EPSILON, 0.001f));
    conf.setBoolean(COMPUTE_VAR, props.getBoolean(COMPUTE_VAR, false));
    conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
    conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));
    // run job
    AvroUtils.runAvroJob(conf);
    boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true);
    if (removeTmpDir) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(outBasePath + "/tmp-data"), true);
    }
}

From source file:com.linkedin.mlease.regression.jobs.RegressionAdmmTrain.java

License:Open Source License

@Override
public void run() throws Exception {
    _logger.info("Now running Regression Train using ADMM...");
    JobConfig props = super.getJobConfig();
    String outBasePath = props.getString(OUTPUT_BASE_PATH);
    JobConf conf = super.createJobConf();

    // Various configs
    int nblocks = props.getInt(NUM_BLOCKS);
    int niter = props.getInt(NUM_ITERS, 10);
    //Aggressive decay of liblinear_epsilon
    boolean aggressiveLiblinearEpsilonDecay = props.getBoolean(AGGRESSIVE_LIBLINEAR_EPSILON_DECAY, false);
    // Getting the value of the regularizer L1/L2
    int reg = props.getInt(REGULARIZER);
    if ((reg != 1) && (reg != 2)) {
        throw new IOException("Only L1 and L2 regularization supported!");
    }// w w w.  j ava2  s  . c  om
    int numClickReplicates = props.getInt(NUM_CLICK_REPLICATES, 1);
    boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false);
    float initializeBoostRate = props.getFloat(INITIALIZE_BOOST_RATE, 0);
    float rhoAdaptCoefficient = props.getFloat(RHO_ADAPT_COEFFICIENT, 0);

    // handling lambda and rho
    // initialize z and u and compute z-u and write to hadoop
    Map<String, LinearModel> z = new HashMap<String, LinearModel>(); // lambda ->
    List<String> lambdastr = props.getStringList(LAMBDA, ",");
    List<String> rhostr = props.getStringList(RHO, null, ",");
    if (rhostr != null) {
        if (rhostr.size() != lambdastr.size())
            throw new IOException(
                    "The number of rho's should be exactly the same as the number of lambda's. OR: don't claim rho!");
    }
    Map<Float, Float> lambdaRho = new HashMap<Float, Float>();
    for (int j = 0; j < lambdastr.size(); j++) {
        float lambda = Float.parseFloat(lambdastr.get(j));
        float rho;
        if (rhostr != null) {
            rho = Float.parseFloat(rhostr.get(j));
        } else {
            if (lambda <= 100) {
                rho = 1;
            } else {
                rho = 10;
            }
        }
        lambdaRho.put(lambda, rho);
        z.put(String.valueOf(lambda), new LinearModel());
    }

    // Get specific lambda treatment for some features
    String lambdaMapPath = props.getString(LAMBDA_MAP, "");
    Map<String, Float> lambdaMap = new HashMap<String, Float>();
    if (!lambdaMapPath.equals("")) {
        AvroHdfsFileReader reader = new AvroHdfsFileReader(conf);
        ReadLambdaMapConsumer consumer = new ReadLambdaMapConsumer();
        reader.build(lambdaMapPath, consumer);
        consumer.done();
        lambdaMap = consumer.get();
    }
    _logger.info("Lambda Map has size = " + String.valueOf(lambdaMap.size()));
    // Write lambda_rho mapping into file
    String rhoPath = outBasePath + "/lambda-rho/part-r-00000.avro";
    writeLambdaRho(conf, rhoPath, lambdaRho);

    // test-loglik computation
    boolean testLoglikPerIter = props.getBoolean(TEST_LOGLIK_PER_ITER, false);
    DataFileWriter<GenericRecord> testRecordWriter = null;
    // test if the test file exists
    String testPath = props.getString(TEST_PATH, "");
    testLoglikPerIter = Util.checkPath(testPath);
    if (testLoglikPerIter) {
        List<Path> testPathList = AvroUtils.enumerateFiles(conf, new Path(testPath));
        if (testPathList.size() > 0) {
            testPath = testPathList.get(0).toString();
            _logger.info("Sample test path = " + testPath);

            AvroHdfsFileWriter<GenericRecord> writer = new AvroHdfsFileWriter<GenericRecord>(conf,
                    outBasePath + "/sample-test-loglik/write-test-00000.avro", SampleTestLoglik.SCHEMA$);
            testRecordWriter = writer.get();
        }
    }
    if (testRecordWriter == null) {
        testLoglikPerIter = false;
        _logger.info(
                "test.loglik.per.iter=false or test path doesn't exist or is empty! So we will not output test loglik per iteration.");
    } else {
        testRecordWriter.close();
    }

    MutableFloat bestTestLoglik = new MutableFloat(-9999999);
    //Initialize z by mean model 
    if (initializeBoostRate > 0 && reg == 2) {
        _logger.info("Now start mean model initializing......");
        // Different paths for L1 vs L2 set from job file
        String initalModelPath;
        initalModelPath = outBasePath + "/initialModel";

        Path initalModelPathFromNaiveTrain = new Path(outBasePath, "models");
        JobConfig propsIni = JobConfig.clone(props);
        if (!propsIni.containsKey(LIBLINEAR_EPSILON)) {
            propsIni.put(LIBLINEAR_EPSILON, 0.01);
        }
        propsIni.put(RegressionNaiveTrain.HEAVY_PER_ITEM_TRAIN, "true");
        propsIni.put(LAMBDA_MAP, lambdaMapPath);
        propsIni.put(REMOVE_TMP_DIR, "false");

        // run job
        RegressionNaiveTrain initializationJob = new RegressionNaiveTrain(
                super.getJobId() + "_ADMMInitialization", propsIni);
        initializationJob.run();

        FileSystem fs = initalModelPathFromNaiveTrain.getFileSystem(conf);
        if (fs.exists(new Path(initalModelPath))) {
            fs.delete(new Path(initalModelPath), true);
        }
        fs.rename(initalModelPathFromNaiveTrain, new Path(initalModelPath));
        // set up lambda
        Set<Float> lambdaSet = new HashSet<Float>();
        for (String l : lambdastr) {
            lambdaSet.add(Float.parseFloat(l));
        }
        // Compute Mean model as initial model
        z = LinearModelUtils.meanModel(conf, initalModelPath, nblocks, lambdaSet.size(), true);

        if (testLoglikPerIter) {
            updateLogLikBestModel(conf, 0, z, testPath, ignoreValue, bestTestLoglik, outBasePath,
                    numClickReplicates);
        }
    }

    double mindiff = 99999999;
    float liblinearEpsilon = 0.01f;
    int i;
    for (i = 1; i <= niter; i++) {
        _logger.info("Now starting iteration " + String.valueOf(i));
        // set up configuration
        props.put(AbstractAvroJob.OUTPUT_PATH, outBasePath + "/iter-" + String.valueOf(i));
        conf = createJobConf(AdmmMapper.class, AdmmReducer.class,
                Pair.getPairSchema(Schema.create(Type.INT), RegressionPrepareOutput.SCHEMA$),
                RegressionTrainOutput.SCHEMA$);
        conf.setPartitionerClass(AdmmPartitioner.class);
        //AvroUtils.setSpecificReducerInput(conf, true);
        conf.setInt(NUM_BLOCKS, nblocks);
        //Added for L1/L2
        conf.setInt(REGULARIZER, reg);
        conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
        //boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false);
        conf.setBoolean(BINARY_FEATURE, ignoreValue);
        conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));

        boolean penalizeIntercept = props.getBoolean(PENALIZE_INTERCEPT, false);
        String interceptKey = props.getString(INTERCEPT_KEY, LibLinearDataset.INTERCEPT_NAME);
        conf.set(INTERCEPT_KEY, interceptKey);
        //int schemaType = props.getInt(SCHEMA_TYPE, 1);

        // compute and store u into file
        // u = uplusx - z
        String uPath = outBasePath + "/iter-" + String.valueOf(i) + "/u/part-r-00000.avro";
        if (i == 1) {
            LinearModelUtils.writeLinearModel(conf, uPath, new HashMap<String, LinearModel>());
            if (initializeBoostRate > 0 && reg == 2) {

                conf.setFloat(RHO_ADAPT_RATE, initializeBoostRate);
            }
        } else {
            String uplusxPath = outBasePath + "/iter-" + String.valueOf(i - 1) + "/model";
            computeU(conf, uPath, uplusxPath, z);
            if (rhoAdaptCoefficient > 0) {
                float curRhoAdaptRate = (float) Math.exp(-(i - 1) * rhoAdaptCoefficient);
                conf.setFloat(RHO_ADAPT_RATE, curRhoAdaptRate);
            }
        }
        // write z into file
        String zPath = outBasePath + "/iter-" + String.valueOf(i) + "/init-value/part-r-00000.avro";
        LinearModelUtils.writeLinearModel(conf, zPath, z);

        // run job
        String outpath = outBasePath + "/iter-" + String.valueOf(i) + "/model";
        conf.set(U_PATH, uPath);
        conf.set(INIT_VALUE_PATH, zPath);
        conf.set(LAMBDA_RHO_MAP, rhoPath);
        if (i > 1 && mindiff < 0.001 && !aggressiveLiblinearEpsilonDecay) // need to get a more accurate estimate from liblinear
        {
            liblinearEpsilon = liblinearEpsilon / 10;
        } else if (aggressiveLiblinearEpsilonDecay && i > 5) {
            liblinearEpsilon = liblinearEpsilon / 10;
        }
        conf.setFloat(LIBLINEAR_EPSILON, liblinearEpsilon);
        //Added for logging aggressive decay
        _logger.info("Liblinear Epsilon for iter = " + String.valueOf(i) + " is: "
                + String.valueOf(liblinearEpsilon));
        _logger.info("aggressiveLiblinearEpsilonDecay=" + aggressiveLiblinearEpsilonDecay);
        AvroOutputFormat.setOutputPath(conf, new Path(outpath));
        AvroUtils.addAvroCacheFiles(conf, new Path(uPath));
        AvroUtils.addAvroCacheFiles(conf, new Path(zPath));
        AvroUtils.addAvroCacheFiles(conf, new Path(rhoPath));
        conf.setNumReduceTasks(nblocks * lambdastr.size());
        AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
        AvroUtils.runAvroJob(conf);
        // Load the result from the last iteration
        // compute z and u given x

        Map<String, LinearModel> xbar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaRho.size(),
                true);
        Map<String, LinearModel> ubar = LinearModelUtils.meanModel(conf, uPath, nblocks, lambdaRho.size(),
                false);
        Map<String, LinearModel> lastz = new HashMap<String, LinearModel>();
        for (String k : z.keySet()) {
            lastz.put(k, z.get(k).copy());
        }
        for (String lambda : xbar.keySet()) {
            LinearModel thisz = z.get(lambda);
            thisz.clear();
            float l = Float.parseFloat(lambda);
            float r = lambdaRho.get(l);
            double weight;
            //L2 regularization
            if (reg == 2) {
                _logger.info("Running code for regularizer = " + String.valueOf(reg));
                weight = nblocks * r / (l + nblocks * r);
                Map<String, Double> weightmap = new HashMap<String, Double>();
                for (String k : lambdaMap.keySet()) {
                    weightmap.put(k, nblocks * r / (lambdaMap.get(k) + nblocks * r + 0.0));
                }
                thisz.linearCombine(1.0, weight, xbar.get(lambda), weightmap);
                if (!ubar.isEmpty()) {
                    thisz.linearCombine(1.0, weight, ubar.get(lambda), weightmap);
                }
                if (!penalizeIntercept) {
                    if (ubar.isEmpty()) {
                        thisz.setIntercept(xbar.get(lambda).getIntercept());
                    } else {
                        thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
                    }
                }
                z.put(lambda, thisz);
            } else {
                // L1 regularization

                _logger.info("Running code for regularizer = " + String.valueOf(reg));
                weight = l / (r * nblocks + 0.0);
                Map<String, Double> weightmap = new HashMap<String, Double>();
                for (String k : lambdaMap.keySet()) {
                    weightmap.put(k, lambdaMap.get(k) / (r * nblocks + 0.0));
                }
                // LinearModel thisz = new LinearModel();
                thisz.linearCombine(1.0, 1.0, xbar.get(lambda));
                if (!ubar.isEmpty()) {
                    thisz.linearCombine(1.0, 1.0, ubar.get(lambda));
                }
                // Iterative Thresholding
                Map<String, Double> thisCoefficients = thisz.getCoefficients();
                for (String k : thisCoefficients.keySet()) {
                    double val = thisCoefficients.get(k);
                    if (val > weight) {
                        thisCoefficients.put(k, val - weight);
                    } else if (val < -weight) {
                        thisCoefficients.put(k, val + weight);
                    }
                }
                thisz.setCoefficients(thisCoefficients);
                if (!penalizeIntercept) {
                    if (ubar.isEmpty()) {
                        thisz.setIntercept(xbar.get(lambda).getIntercept());
                    } else {
                        thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
                    }
                }
                z.put(lambda, thisz);
            }
        }
        xbar.clear();
        ubar.clear();
        // Output max difference between last z and this z
        mindiff = 99999999;
        double maxdiff = 0;
        for (String k : z.keySet()) {
            LinearModel tmp = lastz.get(k);
            if (tmp == null)
                tmp = new LinearModel();
            tmp.linearCombine(1, -1, z.get(k));
            double diff = tmp.maxAbsValue();
            _logger.info(
                    "For lambda=" + k + ": Max Difference between last z and this z = " + String.valueOf(diff));
            tmp.clear();
            if (mindiff > diff)
                mindiff = diff;
            if (maxdiff < diff)
                maxdiff = diff;
        }
        double epsilon = props.getDouble(EPSILON, 0.0001);
        // remove tmp files?
        if (props.getBoolean(REMOVE_TMP_DIR, false) && i >= 2) {
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(outBasePath + "/iter-" + String.valueOf(i - 1)), true);
        }
        // Output testloglik and update best model
        if (testLoglikPerIter) {
            updateLogLikBestModel(conf, i, z, testPath, ignoreValue, bestTestLoglik, outBasePath,
                    numClickReplicates);
        }

        if (maxdiff < epsilon && liblinearEpsilon <= 0.00001) {
            break;
        }
    }

    // write z into file
    String zPath = outBasePath + "/final-model/part-r-00000.avro";
    LinearModelUtils.writeLinearModel(conf, zPath, z);
    // remove tmp files?
    if (props.getBoolean(REMOVE_TMP_DIR, false)) {
        FileSystem fs = FileSystem.get(conf);
        Path initalModelPath = new Path(outBasePath + "/initialModel");
        if (fs.exists(initalModelPath)) {
            fs.delete(initalModelPath, true);
        }
        for (int j = i - 2; j <= i; j++) {
            Path deletepath = new Path(outBasePath + "/iter-" + String.valueOf(j));
            if (fs.exists(deletepath)) {
                fs.delete(deletepath, true);
            }
        }
        fs.delete(new Path(outBasePath + "/tmp-data"), true);
    }

}

From source file:com.linkedin.mlease.regression.jobs.RegressionNaiveTrain.java

License:Open Source License

@Override
public void run() throws Exception {
    JobConfig props = super.getJobConfig();
    String outBasePath = props.getString(OUTPUT_BASE_PATH);
    boolean heavyPerItemTrain = props.getBoolean(HEAVY_PER_ITEM_TRAIN, false);

    String partitionIdPath = "";
    if (heavyPerItemTrain) {
        partitionIdPath = outBasePath + "/partitionIds";
        props.put(AbstractAvroJob.OUTPUT_PATH, partitionIdPath);
        JobConf conf = createJobConf(PartitionIdAssignerMapper.class, PartitionIdAssignerReducer.class,
                PartitionIdAssignerCombiner.class,
                Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)),
                Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
        conf.set(LAMBDA, props.getString(LAMBDA));
        AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
        conf.setNumReduceTasks(1);/* w  ww.  j  av a2 s. c  o  m*/
        AvroUtils.runAvroJob(conf);
    }
    _logger.info("Start training per-key naive logistic regression model...");
    String outpath = outBasePath + "/models";
    props.put(AbstractAvroJob.OUTPUT_PATH, outpath);
    JobConf conf = createJobConf(NaiveMapper.class, NaiveReducer.class,
            Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$),
            LinearModelAvro.SCHEMA$);
    // set up conf
    boolean computeModelMean = props.getBoolean(COMPUTE_MODEL_MEAN, true);
    int nblocks = -1;
    if (computeModelMean) {
        nblocks = props.getInt(NUM_BLOCKS);
        conf.setInt(NUM_BLOCKS, nblocks);
    }
    List<String> lambdastr = props.getStringList(LAMBDA, ",");
    conf.set(LAMBDA, props.getString(LAMBDA));
    conf.setFloat(PRIOR_MEAN, props.getFloat(PRIOR_MEAN, 0.0));
    conf.setBoolean(PENALIZE_INTERCEPT, props.getBoolean(PENALIZE_INTERCEPT, false));
    conf.setBoolean(HAS_INTERCEPT, props.getBoolean(HAS_INTERCEPT, true));
    conf.set(INTERCEPT_KEY, props.getString(INTERCEPT_KEY, LIBLINEAR_INTERCEPT_KEY));
    conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
    boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true);
    conf.setFloat(LIBLINEAR_EPSILON, props.getFloat(LIBLINEAR_EPSILON, 0.001f));
    String lambdaMap = props.getString(LAMBDA_MAP, "");
    conf.set(LAMBDA_MAP, lambdaMap);
    if (!lambdaMap.equals("")) {
        AvroUtils.addAvroCacheFiles(conf, new Path(lambdaMap));
    }
    conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
    conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));
    // set up lambda
    Set<Float> lambdaSet = new HashSet<Float>();
    for (String l : lambdastr) {
        lambdaSet.add(Float.parseFloat(l));
    }

    conf.setInt(DATA_SIZE_THRESHOLD, props.getInt(DATA_SIZE_THRESHOLD, 0));
    // set up partition id
    if (heavyPerItemTrain && !partitionIdPath.equals("")) {
        conf.set(PARTITION_ID_PATH, partitionIdPath);
        AvroHdfsFileReader reader = new AvroHdfsFileReader(conf);
        ReadPartitionIdAssignmentConsumer consumer = new ReadPartitionIdAssignmentConsumer();
        reader.build(partitionIdPath, consumer);
        Map<String, Integer> partitionIdMap = consumer.get();
        int maxPartitionId = 0;
        for (int v : partitionIdMap.values()) {
            if (v > maxPartitionId) {
                maxPartitionId = v;
            }
        }
        AvroUtils.addAvroCacheFiles(conf, new Path(partitionIdPath));
        conf.setNumReduceTasks(maxPartitionId + 1);
        conf.setPartitionerClass(NaivePartitioner.class);
    }
    // run job
    AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
    AvroUtils.runAvroJob(conf);
    // Compute Mean
    if (computeModelMean) {
        Map<String, LinearModel> betabar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaSet.size(),
                true);
        // Output the mean for each lambda
        // write z into file
        String finalOutPath = outBasePath + "/final-model/part-r-00000.avro";
        LinearModelUtils.writeLinearModel(conf, finalOutPath, betabar);
    }
    // remove tmp dir
    if (removeTmpDir) {
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(outBasePath + "/tmp-data"), true);
    }
}

From source file:com.linkedin.mlease.regression.jobs.RegressionTest.java

License:Open Source License

@Override
public void run() throws Exception {
    JobConfig props = super.getJobConfig();
    JobConf conf = super.createJobConf();
    if (!props.getString("input.paths").equals("")) {
        // set up configuration
        _logger.info("Now starting test...");
        List<String> lambdastr = props.getStringList(LAMBDA, ",");
        String outBasePath = props.getString(OUTPUT_BASE_PATH);
        for (String lambda : lambdastr) {
            String outPath = outBasePath + "/lambda-" + lambda;
            props.put(AbstractAvroJob.OUTPUT_PATH, outPath);
            conf = createJobConf(AdmmTestMapper.class, AdmmTestReducer.class);
            AvroOutputFormat.setOutputPath(conf, new Path(outPath));
            String modelPath = props.getString(MODEL_BASE_PATH);
            modelPath = modelPath + "/final-model";
            AvroUtils.addAvroCacheFiles(conf, new Path(modelPath));
            conf.set(MODEL_PATH, modelPath);
            conf.setFloat(LAMBDA, Float.parseFloat(lambda));
            conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
            AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
            AvroUtils.runAvroJob(conf);//from w w w  . j av a2  s.c  o  m
        }
        // also do full prediction on best-model if it exists
        FileSystem fs = FileSystem.get(conf);
        String modelPath = props.getString(MODEL_BASE_PATH) + "/best-model";
        if (fs.exists(new Path(modelPath))) {
            String outPath = outBasePath + "/best-model";
            props.put(AbstractAvroJob.OUTPUT_PATH, outPath);
            conf = createJobConf(AdmmTestMapper.class, AdmmTestReducer.class);
            AvroOutputFormat.setOutputPath(conf, new Path(outPath));
            AvroUtils.addAvroCacheFiles(conf, new Path(modelPath));
            conf.set(MODEL_PATH, modelPath);
            conf.setFloat(LAMBDA, -1);
            conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
            AvroJob.setInputSchema(conf, AvroUtils.getAvroInputSchema(conf));
            AvroUtils.runAvroJob(conf);
        }
    } else {
        _logger.info("test.input.paths is empty! So no test will be done!");
    }
}

From source file:com.ricemap.spateDB.operations.Sampler.java

License:Apache License

/**
 * Sample a ratio of the file through a MapReduce job
 * @param fs// w w w . j  av  a  2  s.  c o m
 * @param files
 * @param ratio
 * @param threshold - Maximum number of elements to be sampled
 * @param output
 * @param inObj
 * @return
 * @throws IOException
 */
public static <T extends TextSerializable, O extends TextSerializable> int sampleMapReduceWithRatio(
        FileSystem fs, Path[] files, double ratio, long threshold, long seed, final ResultCollector<O> output,
        T inObj, O outObj) throws IOException {
    JobConf job = new JobConf(FileMBR.class);

    Path outputPath;
    FileSystem outFs = FileSystem.get(job);
    do {
        outputPath = new Path(files[0].toUri().getPath() + ".sample_" + (int) (Math.random() * 1000000));
    } while (outFs.exists(outputPath));

    job.setJobName("Sample");
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setClass(InClass, inObj.getClass(), TextSerializable.class);
    job.setClass(OutClass, outObj.getClass(), TextSerializable.class);

    job.setMapperClass(Map.class);
    job.setLong(RANDOM_SEED, seed);
    job.setFloat(SAMPLE_RATIO, (float) ratio);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(0);

    job.setInputFormat(ShapeLineInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);

    ShapeLineInputFormat.setInputPaths(job, files);
    TextOutputFormat.setOutputPath(job, outputPath);

    // Submit the job
    RunningJob run_job = JobClient.runJob(job);

    Counters counters = run_job.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.MAP_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    Counter inputBytesCounter = counters.findCounter(Task.Counter.MAP_INPUT_BYTES);
    Sampler.sizeOfLastProcessedFile = inputBytesCounter.getValue();

    // Ratio of records to return from output based on the threshold
    // Note that any number greater than or equal to one will cause all
    // elements to be returned
    final double selectRatio = (double) threshold / resultCount;

    // Read job result
    int result_size = 0;
    if (output != null) {
        Text line = new Text();
        FileStatus[] results = outFs.listStatus(outputPath);

        for (FileStatus fileStatus : results) {
            if (fileStatus.getLen() > 0 && fileStatus.getPath().getName().startsWith("part-")) {
                LineReader lineReader = new LineReader(outFs.open(fileStatus.getPath()));
                try {
                    while (lineReader.readLine(line) > 0) {
                        if (Math.random() < selectRatio) {
                            if (output != null) {
                                outObj.fromText(line);
                                output.collect(outObj);
                            }
                            result_size++;
                        }
                    }
                } catch (RuntimeException e) {
                    e.printStackTrace();
                }
                lineReader.close();
            }
        }
    }

    outFs.delete(outputPath, true);

    return result_size;
}

From source file:edu.ub.ahstfg.hadoop.ParamSet.java

License:Open Source License

/**
 * Transfers the parameters to a Hadoop job.
 * @param job Job where parameters will be transfered.
 *//*  ww w  .j  a v  a 2 s.  c  o  m*/
public void toJobConf(JobConf job) {
    for (String key : strings.keySet()) {
        job.set(key, strings.get(key));
    }
    for (String key : ints.keySet()) {
        job.setInt(key, ints.get(key));
    }
    for (String key : floats.keySet()) {
        job.setFloat(key, floats.get(key));
    }
}

From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java

License:Apache License

private void phase2(String path, int i, int j, int n, float missing) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    sLogger.info("missing PageRank mass: " + missing);
    sLogger.info("number of nodes: " + n);

    String in = path + "/iter" + sFormat.format(j) + "t";
    String out = path + "/iter" + sFormat.format(j);

    sLogger.info("PageRank: iteration " + j + ": Phase2");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);

    int numMapTasks = FileSystem.get(conf).listStatus(new Path(in)).length;
    int numReduceTasks = 0;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase2");
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.setFloat("MissingMass", (float) missing);
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);//from  ww w .jav a 2s . c o  m
    conf.setNumReduceTasks(numReduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapPageRankMassDistributionClass.class);
    conf.setCombinerClass(IdentityReducer.class);
    conf.setReducerClass(IdentityReducer.class);

    FileSystem.get(conf).delete(new Path(out), true);

    JobClient.runJob(conf);
}