Example usage for org.apache.hadoop.mapred.JobConf.set

Introduction

On this page you can find example usages of the set method of org.apache.hadoop.mapred.JobConf.

Prototype

public void set(String name, String value)

Source Link

Document

Sets the value of the property called name to value.

Usage

From source file:com.linkedin.mapred.AvroUtils.java

License:Open Source License

/**
 * Passes {@code inputPath} to {@code addAvroCacheFiles} and then stores the path's
 * string form in the job configuration under {@code property}.
 *
 * @param conf      job configuration that is updated
 * @param inputPath path handed to {@code addAvroCacheFiles} and recorded as the property value
 * @param property  configuration key that receives {@code inputPath.toString()}
 * @throws Exception declared by this method; presumably propagated from
 *                   {@code addAvroCacheFiles} (its body is not visible here)
 */
public static void addAvroCacheFilesAndSetTheProperty(JobConf conf, Path inputPath, String property)
        throws Exception {
    addAvroCacheFiles(conf, inputPath);
    final String pathValue = inputPath.toString();
    conf.set(property, pathValue);
}

From source file:com.linkedin.mlease.regression.jobs.ItemModelTest.java

License:Open Source License

/**
 * Runs one per-item test job for every lambda listed under {@code LAMBDA}.
 *
 * For each lambda the job output goes to {@code <OUTPUT_BASE_PATH>/lambda-<lambda>},
 * the model at {@code MODEL_PATH} is registered via
 * {@code AvroUtils.addAvroCacheFilesAndSetTheProperty}, and the job is launched with
 * {@code AvroUtils.runAvroJob}.
 *
 * @throws Exception if configuration lookup, job setup or job execution fails
 */
@Override
public void run() throws Exception {
    final JobConfig jobProps = super.getJobConfig();
    final List<String> lambdaValues = jobProps.getStringList(LAMBDA, ",");
    final String baseOutputDir = jobProps.getString(OUTPUT_BASE_PATH);

    for (final String lambdaText : lambdaValues) {
        // Set the output path before building the JobConf; presumably createJobConf
        // reads "output.path" from the job properties -- TODO confirm.
        jobProps.put("output.path", baseOutputDir + "/lambda-" + lambdaText);
        final JobConf jobConf = createJobConf(PerItemTestMapper.class, PerItemTestReducer.class);

        final Path modelLocation = new Path(jobProps.get(MODEL_PATH));
        AvroUtils.addAvroCacheFilesAndSetTheProperty(jobConf, modelLocation, MODEL_PATH);

        jobConf.set(ITEM_KEY, jobProps.getString(ITEM_KEY));
        jobConf.setFloat(LAMBDA, Float.parseFloat(lambdaText));
        jobConf.setBoolean(BINARY_FEATURE, jobProps.getBoolean(BINARY_FEATURE, false));
        jobConf.setPartitionerClass(PerItemTestPartitioner.class);
        // Publish the reducer count already configured on the JobConf under NUM_REDUCERS.
        jobConf.setInt(NUM_REDUCERS, jobConf.getNumReduceTasks());

        AvroUtils.runAvroJob(jobConf);
    }
}

From source file:com.linkedin.mlease.regression.jobs.ItemModelTrain.java

License:Open Source License

/**
 * Trains the per-key model: configures a job with {@code ItemModelTrainMapper} /
 * {@code ItemModelTrainReducer}, writes its output under
 * {@code <OUTPUT_MODEL_PATH>/models}, runs it, and optionally deletes
 * {@code <OUTPUT_MODEL_PATH>/tmp-data} afterwards.
 *
 * @throws Exception if configuration lookup, job setup, job execution or cleanup fails
 */
@Override
public void run() throws Exception {
    final JobConfig jobProps = super.getJobConfig();
    _logger.info("Start training per-key naive logistic regression model...");

    final String modelBaseDir = jobProps.getString(OUTPUT_MODEL_PATH);
    jobProps.put("output.path", modelBaseDir + "/models");
    final JobConf jobConf = createJobConf(ItemModelTrainMapper.class, ItemModelTrainReducer.class,
            Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$),
            LinearModelWithVarAvro.SCHEMA$);

    // Optional side inputs: each is registered only when a non-empty path is configured.
    final String priorMeanMapPath = jobProps.getString(INTERCEPT_PRIOR_MEAN_MAP, "");
    if (!priorMeanMapPath.isEmpty()) {
        AvroUtils.addAvroCacheFilesAndSetTheProperty(jobConf, new Path(priorMeanMapPath),
                INTERCEPT_PRIOR_MEAN_MAP);
    }
    final String lambdaMapPath = jobProps.getString(LAMBDA_MAP, "");
    if (!lambdaMapPath.isEmpty()) {
        AvroUtils.addAvroCacheFilesAndSetTheProperty(jobConf, new Path(lambdaMapPath), LAMBDA_MAP);
    }

    // Copy the remaining settings from the job properties into the JobConf.
    jobConf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN, (float) jobProps.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN, 0));
    jobConf.set(INTERCEPT_LAMBDAS, jobProps.get(INTERCEPT_LAMBDAS));
    jobConf.set(DEFAULT_LAMBDAS, jobProps.get(DEFAULT_LAMBDAS));
    jobConf.setLong(REPORT_FREQUENCY, jobProps.getLong(REPORT_FREQUENCY, 1000000));
    jobConf.setFloat(LIBLINEAR_EPSILON, (float) jobProps.getDouble(LIBLINEAR_EPSILON, 0.001f));
    jobConf.setBoolean(COMPUTE_VAR, jobProps.getBoolean(COMPUTE_VAR, false));
    jobConf.setBoolean(BINARY_FEATURE, jobProps.getBoolean(BINARY_FEATURE, false));
    jobConf.setBoolean(SHORT_FEATURE_INDEX, jobProps.getBoolean(SHORT_FEATURE_INDEX, false));

    AvroUtils.runAvroJob(jobConf);

    // Temporary data is removed unless REMOVE_TMP_DIR is explicitly set to false.
    if (jobProps.getBoolean(REMOVE_TMP_DIR, true)) {
        FileSystem.get(jobConf).delete(new Path(modelBaseDir + "/tmp-data"), true);
    }
}

From source file:com.linkedin.mlease.regression.jobs.RegressionAdmmTrain.java

License:Open Source License

/**
 * Trains regression models with ADMM, keeping one consensus model {@code z} per lambda.
 *
 * Steps performed by this method:
 * <ol>
 *   <li>Read the job settings and build the lambda-to-rho map (when {@code RHO} is not
 *       configured, rho defaults to 1 for lambda &lt;= 100 and to 10 otherwise); write it to
 *       {@code <OUTPUT_BASE_PATH>/lambda-rho/part-r-00000.avro}.</li>
 *   <li>Optionally load per-feature lambdas from {@code LAMBDA_MAP}.</li>
 *   <li>Decide whether test log-likelihood is reported per iteration.</li>
 *   <li>For L2 with {@code INITIALIZE_BOOST_RATE > 0}, initialize {@code z} from the mean model
 *       produced by a {@code RegressionNaiveTrain} run.</li>
 *   <li>Iterate up to {@code NUM_ITERS} times: run the {@code AdmmMapper}/{@code AdmmReducer}
 *       job, then update {@code z} from the block means of x and u.</li>
 *   <li>Write the final {@code z} to {@code <OUTPUT_BASE_PATH>/final-model/part-r-00000.avro}
 *       and optionally delete temporary directories.</li>
 * </ol>
 *
 * @throws IOException if {@code REGULARIZER} is neither 1 nor 2, or if the number of rho
 *                     values differs from the number of lambda values
 * @throws Exception   if any job, file-system or model I/O step fails
 */
@Override
public void run() throws Exception {
    _logger.info("Now running Regression Train using ADMM...");
    JobConfig props = super.getJobConfig();
    String outBasePath = props.getString(OUTPUT_BASE_PATH);
    JobConf conf = super.createJobConf();

    // Various configs
    int nblocks = props.getInt(NUM_BLOCKS);
    int niter = props.getInt(NUM_ITERS, 10);
    // Aggressive decay of liblinear_epsilon
    boolean aggressiveLiblinearEpsilonDecay = props.getBoolean(AGGRESSIVE_LIBLINEAR_EPSILON_DECAY, false);
    // Getting the value of the regularizer L1/L2
    int reg = props.getInt(REGULARIZER);
    if ((reg != 1) && (reg != 2)) {
        throw new IOException("Only L1 and L2 regularization supported!");
    }
    int numClickReplicates = props.getInt(NUM_CLICK_REPLICATES, 1);
    boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false);
    float initializeBoostRate = props.getFloat(INITIALIZE_BOOST_RATE, 0);
    float rhoAdaptCoefficient = props.getFloat(RHO_ADAPT_COEFFICIENT, 0);

    // handling lambda and rho
    // initialize z and u and compute z-u and write to hadoop
    Map<String, LinearModel> z = new HashMap<String, LinearModel>(); // String.valueOf(lambda) -> z
    List<String> lambdastr = props.getStringList(LAMBDA, ",");
    List<String> rhostr = props.getStringList(RHO, null, ",");
    if (rhostr != null) {
        if (rhostr.size() != lambdastr.size()) {
            throw new IOException(
                    "The number of rho's should be exactly the same as the number of lambda's. OR: don't claim rho!");
        }
    }
    Map<Float, Float> lambdaRho = new HashMap<Float, Float>();
    for (int j = 0; j < lambdastr.size(); j++) {
        float lambda = Float.parseFloat(lambdastr.get(j));
        float rho;
        if (rhostr != null) {
            rho = Float.parseFloat(rhostr.get(j));
        } else {
            // No explicit rho configured: fall back to a lambda-dependent default.
            if (lambda <= 100) {
                rho = 1;
            } else {
                rho = 10;
            }
        }
        lambdaRho.put(lambda, rho);
        z.put(String.valueOf(lambda), new LinearModel());
    }

    // Get specific lambda treatment for some features
    String lambdaMapPath = props.getString(LAMBDA_MAP, "");
    Map<String, Float> lambdaMap = new HashMap<String, Float>();
    if (!lambdaMapPath.equals("")) {
        AvroHdfsFileReader reader = new AvroHdfsFileReader(conf);
        ReadLambdaMapConsumer consumer = new ReadLambdaMapConsumer();
        reader.build(lambdaMapPath, consumer);
        consumer.done();
        lambdaMap = consumer.get();
    }
    _logger.info("Lambda Map has size = " + String.valueOf(lambdaMap.size()));
    // Write lambda_rho mapping into file
    String rhoPath = outBasePath + "/lambda-rho/part-r-00000.avro";
    writeLambdaRho(conf, rhoPath, lambdaRho);

    // test-loglik computation
    boolean testLoglikPerIter = props.getBoolean(TEST_LOGLIK_PER_ITER, false);
    DataFileWriter<GenericRecord> testRecordWriter = null;
    // test if the test file exists
    String testPath = props.getString(TEST_PATH, "");
    // Fix: the configured flag used to be overwritten by the path check, so
    // test.loglik.per.iter=false was silently ignored. Both conditions must now hold,
    // which matches the log message below.
    testLoglikPerIter = testLoglikPerIter && Util.checkPath(testPath);
    if (testLoglikPerIter) {
        List<Path> testPathList = AvroUtils.enumerateFiles(conf, new Path(testPath));
        if (testPathList.size() > 0) {
            // Only the first enumerated file is used as the sample test set.
            testPath = testPathList.get(0).toString();
            _logger.info("Sample test path = " + testPath);

            AvroHdfsFileWriter<GenericRecord> writer = new AvroHdfsFileWriter<GenericRecord>(conf,
                    outBasePath + "/sample-test-loglik/write-test-00000.avro", SampleTestLoglik.SCHEMA$);
            testRecordWriter = writer.get();
        }
    }
    if (testRecordWriter == null) {
        testLoglikPerIter = false;
        _logger.info(
                "test.loglik.per.iter=false or test path doesn't exist or is empty! So we will not output test loglik per iteration.");
    } else {
        // The writer is closed right after being opened; nothing is written through it here.
        testRecordWriter.close();
    }

    MutableFloat bestTestLoglik = new MutableFloat(-9999999);
    // Initialize z by mean model
    if (initializeBoostRate > 0 && reg == 2) {
        _logger.info("Now start mean model initializing......");
        // Different paths for L1 vs L2 set from job file
        String initalModelPath;
        initalModelPath = outBasePath + "/initialModel";

        Path initalModelPathFromNaiveTrain = new Path(outBasePath, "models");
        JobConfig propsIni = JobConfig.clone(props);
        if (!propsIni.containsKey(LIBLINEAR_EPSILON)) {
            propsIni.put(LIBLINEAR_EPSILON, 0.01);
        }
        propsIni.put(RegressionNaiveTrain.HEAVY_PER_ITEM_TRAIN, "true");
        propsIni.put(LAMBDA_MAP, lambdaMapPath);
        propsIni.put(REMOVE_TMP_DIR, "false");

        // run job
        RegressionNaiveTrain initializationJob = new RegressionNaiveTrain(
                super.getJobId() + "_ADMMInitialization", propsIni);
        initializationJob.run();

        // Move the naive-train output from <base>/models to <base>/initialModel,
        // replacing any previous initial model.
        FileSystem fs = initalModelPathFromNaiveTrain.getFileSystem(conf);
        if (fs.exists(new Path(initalModelPath))) {
            fs.delete(new Path(initalModelPath), true);
        }
        fs.rename(initalModelPathFromNaiveTrain, new Path(initalModelPath));
        // set up lambda
        Set<Float> lambdaSet = new HashSet<Float>();
        for (String l : lambdastr) {
            lambdaSet.add(Float.parseFloat(l));
        }
        // Compute Mean model as initial model
        z = LinearModelUtils.meanModel(conf, initalModelPath, nblocks, lambdaSet.size(), true);

        if (testLoglikPerIter) {
            updateLogLikBestModel(conf, 0, z, testPath, ignoreValue, bestTestLoglik, outBasePath,
                    numClickReplicates);
        }
    }

    double mindiff = 99999999;
    float liblinearEpsilon = 0.01f;
    int i;
    for (i = 1; i <= niter; i++) {
        _logger.info("Now starting iteration " + String.valueOf(i));
        // set up configuration
        props.put(AbstractAvroJob.OUTPUT_PATH, outBasePath + "/iter-" + String.valueOf(i));
        conf = createJobConf(AdmmMapper.class, AdmmReducer.class,
                Pair.getPairSchema(Schema.create(Type.INT), RegressionPrepareOutput.SCHEMA$),
                RegressionTrainOutput.SCHEMA$);
        conf.setPartitionerClass(AdmmPartitioner.class);
        conf.setInt(NUM_BLOCKS, nblocks);
        // Added for L1/L2
        conf.setInt(REGULARIZER, reg);
        conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
        conf.setBoolean(BINARY_FEATURE, ignoreValue);
        conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));

        boolean penalizeIntercept = props.getBoolean(PENALIZE_INTERCEPT, false);
        String interceptKey = props.getString(INTERCEPT_KEY, LibLinearDataset.INTERCEPT_NAME);
        conf.set(INTERCEPT_KEY, interceptKey);

        // compute and store u into file
        // u = uplusx - z
        String uPath = outBasePath + "/iter-" + String.valueOf(i) + "/u/part-r-00000.avro";
        if (i == 1) {
            // First iteration starts from an empty u.
            LinearModelUtils.writeLinearModel(conf, uPath, new HashMap<String, LinearModel>());
            if (initializeBoostRate > 0 && reg == 2) {

                conf.setFloat(RHO_ADAPT_RATE, initializeBoostRate);
            }
        } else {
            String uplusxPath = outBasePath + "/iter-" + String.valueOf(i - 1) + "/model";
            computeU(conf, uPath, uplusxPath, z);
            if (rhoAdaptCoefficient > 0) {
                // Adapt rate decays exponentially with the iteration number.
                float curRhoAdaptRate = (float) Math.exp(-(i - 1) * rhoAdaptCoefficient);
                conf.setFloat(RHO_ADAPT_RATE, curRhoAdaptRate);
            }
        }
        // write z into file
        String zPath = outBasePath + "/iter-" + String.valueOf(i) + "/init-value/part-r-00000.avro";
        LinearModelUtils.writeLinearModel(conf, zPath, z);

        // run job
        String outpath = outBasePath + "/iter-" + String.valueOf(i) + "/model";
        conf.set(U_PATH, uPath);
        conf.set(INIT_VALUE_PATH, zPath);
        conf.set(LAMBDA_RHO_MAP, rhoPath);
        if (i > 1 && mindiff < 0.001 && !aggressiveLiblinearEpsilonDecay) // need to get a more accurate estimate from liblinear
        {
            liblinearEpsilon = liblinearEpsilon / 10;
        } else if (aggressiveLiblinearEpsilonDecay && i > 5) {
            liblinearEpsilon = liblinearEpsilon / 10;
        }
        conf.setFloat(LIBLINEAR_EPSILON, liblinearEpsilon);
        // Added for logging aggressive decay
        _logger.info("Liblinear Epsilon for iter = " + String.valueOf(i) + " is: "
                + String.valueOf(liblinearEpsilon));
        _logger.info("aggressiveLiblinearEpsilonDecay=" + aggressiveLiblinearEpsilonDecay);
        AvroOutputFormat.setOutputPath(conf, new Path(outpath));
        AvroUtils.addAvroCacheFiles(conf, new Path(uPath));
        AvroUtils.addAvroCacheFiles(conf, new Path(zPath));
        AvroUtils.addAvroCacheFiles(conf, new Path(rhoPath));
        // One reducer per (block, lambda) combination.
        conf.setNumReduceTasks(nblocks * lambdastr.size());
        AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
        AvroUtils.runAvroJob(conf);
        // Load the result from the last iteration
        // compute z and u given x

        Map<String, LinearModel> xbar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaRho.size(),
                true);
        Map<String, LinearModel> ubar = LinearModelUtils.meanModel(conf, uPath, nblocks, lambdaRho.size(),
                false);
        // Snapshot z so the change produced by this iteration can be measured below.
        Map<String, LinearModel> lastz = new HashMap<String, LinearModel>();
        for (String k : z.keySet()) {
            lastz.put(k, z.get(k).copy());
        }
        for (String lambda : xbar.keySet()) {
            LinearModel thisz = z.get(lambda);
            thisz.clear();
            float l = Float.parseFloat(lambda);
            float r = lambdaRho.get(l);
            double weight;
            // L2 regularization
            if (reg == 2) {
                _logger.info("Running code for regularizer = " + String.valueOf(reg));
                weight = nblocks * r / (l + nblocks * r);
                // Per-feature weights for features that have their own lambda.
                Map<String, Double> weightmap = new HashMap<String, Double>();
                for (String k : lambdaMap.keySet()) {
                    weightmap.put(k, nblocks * r / (lambdaMap.get(k) + nblocks * r + 0.0));
                }
                thisz.linearCombine(1.0, weight, xbar.get(lambda), weightmap);
                if (!ubar.isEmpty()) {
                    thisz.linearCombine(1.0, weight, ubar.get(lambda), weightmap);
                }
                if (!penalizeIntercept) {
                    // Unpenalized intercept: take it directly from xbar (+ ubar) without shrinkage.
                    if (ubar.isEmpty()) {
                        thisz.setIntercept(xbar.get(lambda).getIntercept());
                    } else {
                        thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
                    }
                }
                z.put(lambda, thisz);
            } else {
                // L1 regularization

                _logger.info("Running code for regularizer = " + String.valueOf(reg));
                weight = l / (r * nblocks + 0.0);
                // NOTE(review): the per-feature lambdas in lambdaMap are not applied on the L1
                // path; the previously computed (and never read) weight map was removed.
                thisz.linearCombine(1.0, 1.0, xbar.get(lambda));
                if (!ubar.isEmpty()) {
                    thisz.linearCombine(1.0, 1.0, ubar.get(lambda));
                }
                // Soft thresholding of (xbar + ubar) with threshold `weight`.
                // Only existing keys are overwritten, so the key set is not structurally
                // modified while it is being iterated.
                Map<String, Double> thisCoefficients = thisz.getCoefficients();
                for (String k : thisCoefficients.keySet()) {
                    double val = thisCoefficients.get(k);
                    if (val > weight) {
                        thisCoefficients.put(k, val - weight);
                    } else if (val < -weight) {
                        thisCoefficients.put(k, val + weight);
                    } else {
                        // Fix: values inside [-weight, weight] must shrink to zero; they were
                        // previously left unchanged, which defeats the L1 penalty.
                        thisCoefficients.put(k, 0.0);
                    }
                }
                thisz.setCoefficients(thisCoefficients);
                if (!penalizeIntercept) {
                    if (ubar.isEmpty()) {
                        thisz.setIntercept(xbar.get(lambda).getIntercept());
                    } else {
                        thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
                    }
                }
                z.put(lambda, thisz);
            }
        }
        xbar.clear();
        ubar.clear();
        // Output max difference between last z and this z
        mindiff = 99999999;
        double maxdiff = 0;
        for (String k : z.keySet()) {
            LinearModel tmp = lastz.get(k);
            if (tmp == null)
                tmp = new LinearModel();
            tmp.linearCombine(1, -1, z.get(k));
            double diff = tmp.maxAbsValue();
            _logger.info(
                    "For lambda=" + k + ": Max Difference between last z and this z = " + String.valueOf(diff));
            tmp.clear();
            if (mindiff > diff)
                mindiff = diff;
            if (maxdiff < diff)
                maxdiff = diff;
        }
        double epsilon = props.getDouble(EPSILON, 0.0001);
        // remove tmp files?
        if (props.getBoolean(REMOVE_TMP_DIR, false) && i >= 2) {
            FileSystem fs = FileSystem.get(conf);
            fs.delete(new Path(outBasePath + "/iter-" + String.valueOf(i - 1)), true);
        }
        // Output testloglik and update best model
        if (testLoglikPerIter) {
            updateLogLikBestModel(conf, i, z, testPath, ignoreValue, bestTestLoglik, outBasePath,
                    numClickReplicates);
        }

        // Stop early once z has stabilised for every lambda and liblinear already runs
        // at its tightest tolerance.
        if (maxdiff < epsilon && liblinearEpsilon <= 0.00001) {
            break;
        }
    }

    // write z into file
    String zPath = outBasePath + "/final-model/part-r-00000.avro";
    LinearModelUtils.writeLinearModel(conf, zPath, z);
    // remove tmp files?
    if (props.getBoolean(REMOVE_TMP_DIR, false)) {
        FileSystem fs = FileSystem.get(conf);
        Path initalModelPath = new Path(outBasePath + "/initialModel");
        if (fs.exists(initalModelPath)) {
            fs.delete(initalModelPath, true);
        }
        // Covers the last iteration directories whether the loop ended via break (i is the
        // last iteration run) or by exhausting niter (i == niter + 1).
        for (int j = i - 2; j <= i; j++) {
            Path deletepath = new Path(outBasePath + "/iter-" + String.valueOf(j));
            if (fs.exists(deletepath)) {
                fs.delete(deletepath, true);
            }
        }
        fs.delete(new Path(outBasePath + "/tmp-data"), true);
    }

}

From source file:com.linkedin.mlease.regression.jobs.RegressionNaiveTrain.java

License:Open Source License

/**
 * Trains the per-key naive logistic regression models.
 *
 * When {@code HEAVY_PER_ITEM_TRAIN} is set, a partition-id assignment job runs first and its
 * output (under {@code <OUTPUT_BASE_PATH>/partitionIds}) determines the reducer count and the
 * partitioner of the training job. The training job writes to
 * {@code <OUTPUT_BASE_PATH>/models}. When {@code COMPUTE_MODEL_MEAN} is true (the default) the
 * mean model is written to {@code <OUTPUT_BASE_PATH>/final-model/part-r-00000.avro}.
 * Finally {@code <OUTPUT_BASE_PATH>/tmp-data} is deleted unless {@code REMOVE_TMP_DIR} is false.
 *
 * @throws Exception if configuration lookup, job execution or file-system access fails
 */
@Override
public void run() throws Exception {
    final JobConfig jobProps = super.getJobConfig();
    final String baseDir = jobProps.getString(OUTPUT_BASE_PATH);
    final boolean heavyPerItemTrain = jobProps.getBoolean(HEAVY_PER_ITEM_TRAIN, false);

    // Stays empty unless the partition-id pre-pass runs.
    final String partitionIdPath = heavyPerItemTrain ? baseDir + "/partitionIds" : "";
    if (heavyPerItemTrain) {
        jobProps.put(AbstractAvroJob.OUTPUT_PATH, partitionIdPath);
        final JobConf assignerConf = createJobConf(PartitionIdAssignerMapper.class,
                PartitionIdAssignerReducer.class, PartitionIdAssignerCombiner.class,
                Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)),
                Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
        assignerConf.set(LAMBDA, jobProps.getString(LAMBDA));
        AvroJob.setInputSchema(assignerConf, RegressionPrepareOutput.SCHEMA$);
        assignerConf.setNumReduceTasks(1);
        AvroUtils.runAvroJob(assignerConf);
    }

    _logger.info("Start training per-key naive logistic regression model...");
    final String modelDir = baseDir + "/models";
    jobProps.put(AbstractAvroJob.OUTPUT_PATH, modelDir);
    final JobConf trainConf = createJobConf(NaiveMapper.class, NaiveReducer.class,
            Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$),
            LinearModelAvro.SCHEMA$);

    // NUM_BLOCKS is only read (and therefore only required) when the mean model is computed.
    final boolean computeModelMean = jobProps.getBoolean(COMPUTE_MODEL_MEAN, true);
    int blockCount = -1;
    if (computeModelMean) {
        blockCount = jobProps.getInt(NUM_BLOCKS);
        trainConf.setInt(NUM_BLOCKS, blockCount);
    }

    // Copy the training settings into the JobConf.
    final List<String> lambdaValues = jobProps.getStringList(LAMBDA, ",");
    trainConf.set(LAMBDA, jobProps.getString(LAMBDA));
    trainConf.setFloat(PRIOR_MEAN, jobProps.getFloat(PRIOR_MEAN, 0.0));
    trainConf.setBoolean(PENALIZE_INTERCEPT, jobProps.getBoolean(PENALIZE_INTERCEPT, false));
    trainConf.setBoolean(HAS_INTERCEPT, jobProps.getBoolean(HAS_INTERCEPT, true));
    trainConf.set(INTERCEPT_KEY, jobProps.getString(INTERCEPT_KEY, LIBLINEAR_INTERCEPT_KEY));
    trainConf.setLong(REPORT_FREQUENCY, jobProps.getLong(REPORT_FREQUENCY, 1000000));
    final boolean removeTmpDir = jobProps.getBoolean(REMOVE_TMP_DIR, true);
    trainConf.setFloat(LIBLINEAR_EPSILON, jobProps.getFloat(LIBLINEAR_EPSILON, 0.001f));
    final String lambdaMapPath = jobProps.getString(LAMBDA_MAP, "");
    trainConf.set(LAMBDA_MAP, lambdaMapPath);
    if (!lambdaMapPath.isEmpty()) {
        AvroUtils.addAvroCacheFiles(trainConf, new Path(lambdaMapPath));
    }
    trainConf.setBoolean(BINARY_FEATURE, jobProps.getBoolean(BINARY_FEATURE, false));
    trainConf.setBoolean(SHORT_FEATURE_INDEX, jobProps.getBoolean(SHORT_FEATURE_INDEX, false));

    // Distinct lambda values; only the count is used, for the mean-model computation.
    final Set<Float> distinctLambdas = new HashSet<Float>();
    for (final String lambdaText : lambdaValues) {
        distinctLambdas.add(Float.parseFloat(lambdaText));
    }

    trainConf.setInt(DATA_SIZE_THRESHOLD, jobProps.getInt(DATA_SIZE_THRESHOLD, 0));

    // Use the partition ids produced by the pre-pass to size and route the reducers.
    if (heavyPerItemTrain) {
        trainConf.set(PARTITION_ID_PATH, partitionIdPath);
        final AvroHdfsFileReader idReader = new AvroHdfsFileReader(trainConf);
        final ReadPartitionIdAssignmentConsumer idConsumer = new ReadPartitionIdAssignmentConsumer();
        idReader.build(partitionIdPath, idConsumer);
        // NOTE(review): consumer.done() is not called before get() here -- confirm it is not needed.
        final Map<String, Integer> partitionIds = idConsumer.get();
        int highestPartitionId = 0;
        for (final int id : partitionIds.values()) {
            highestPartitionId = Math.max(highestPartitionId, id);
        }
        AvroUtils.addAvroCacheFiles(trainConf, new Path(partitionIdPath));
        trainConf.setNumReduceTasks(highestPartitionId + 1);
        trainConf.setPartitionerClass(NaivePartitioner.class);
    }

    AvroJob.setInputSchema(trainConf, RegressionPrepareOutput.SCHEMA$);
    AvroUtils.runAvroJob(trainConf);

    if (computeModelMean) {
        // Mean model per lambda, written out as the final model.
        final Map<String, LinearModel> meanModels = LinearModelUtils.meanModel(trainConf, modelDir,
                blockCount, distinctLambdas.size(), true);
        final String finalModelFile = baseDir + "/final-model/part-r-00000.avro";
        LinearModelUtils.writeLinearModel(trainConf, finalModelFile, meanModels);
    }

    if (removeTmpDir) {
        FileSystem.get(trainConf).delete(new Path(baseDir + "/tmp-data"), true);
    }
}

From source file:com.linkedin.mlease.regression.jobs.RegressionPrepare.java

License:Open Source License

/**
 * Configures and runs the preparation job built around {@code RegressionPrepareMapper},
 * copying {@code MAP_KEY}, {@code NUM_CLICK_REPLICATES}, {@code IGNORE_FEATURE_VALUE} and
 * {@code NUM_BLOCKS} from the job properties into the job configuration.
 *
 * @throws Exception if configuration lookup or job execution fails
 */
@Override
public void run() throws Exception {
    final JobConfig jobProps = super.getJobConfig();
    final JobConf jobConf = super.createJobConf(RegressionPrepareMapper.class, RegressionPrepareOutput.SCHEMA$);

    // Missing settings fall back to: empty map key, 1 click replicate, feature values kept, 0 blocks.
    final String mapKey = jobProps.getString(MAP_KEY, "");
    jobConf.set(MAP_KEY, mapKey);
    jobConf.setInt(NUM_CLICK_REPLICATES, jobProps.getInt(NUM_CLICK_REPLICATES, 1));
    jobConf.setBoolean(IGNORE_FEATURE_VALUE, jobProps.getBoolean(IGNORE_FEATURE_VALUE, false));
    final int blockCount = jobProps.getInt(NUM_BLOCKS, 0);
    jobConf.setInt(NUM_BLOCKS, blockCount);

    _logger.info(
            "Running the preparation job of admm with map.key = " + mapKey + " and num.blocks=" + blockCount);
    AvroUtils.runAvroJob(jobConf);
}

From source file:com.linkedin.mlease.regression.jobs.RegressionTest.java

License:Open Source License

/**
 * Runs the test job once per configured lambda against
 * {@code <MODEL_BASE_PATH>/final-model}, writing to {@code <OUTPUT_BASE_PATH>/lambda-<lambda>},
 * and once more against {@code <MODEL_BASE_PATH>/best-model} when that path exists, writing to
 * {@code <OUTPUT_BASE_PATH>/best-model} with {@code LAMBDA} set to -1.
 *
 * Nothing is run when the {@code "input.paths"} property is the empty string.
 *
 * @throws Exception if configuration lookup, file-system access or job execution fails
 */
@Override
public void run() throws Exception {
    final JobConfig jobProps = super.getJobConfig();
    JobConf jobConf = super.createJobConf();

    if (jobProps.getString("input.paths").equals("")) {
        // NOTE(review): the message names "test.input.paths" while the key checked is "input.paths".
        _logger.info("test.input.paths is empty! So no test will be done!");
        return;
    }

    _logger.info("Now starting test...");
    final List<String> lambdaValues = jobProps.getStringList(LAMBDA, ",");
    final String baseOutputDir = jobProps.getString(OUTPUT_BASE_PATH);

    // One prediction job per lambda, each using the final model.
    for (final String lambdaText : lambdaValues) {
        final String lambdaOutputDir = baseOutputDir + "/lambda-" + lambdaText;
        jobProps.put(AbstractAvroJob.OUTPUT_PATH, lambdaOutputDir);
        jobConf = createJobConf(AdmmTestMapper.class, AdmmTestReducer.class);
        AvroOutputFormat.setOutputPath(jobConf, new Path(lambdaOutputDir));
        final String finalModelDir = jobProps.getString(MODEL_BASE_PATH) + "/final-model";
        AvroUtils.addAvroCacheFiles(jobConf, new Path(finalModelDir));
        jobConf.set(MODEL_PATH, finalModelDir);
        jobConf.setFloat(LAMBDA, Float.parseFloat(lambdaText));
        jobConf.setBoolean(BINARY_FEATURE, jobProps.getBoolean(BINARY_FEATURE, false));
        AvroJob.setInputSchema(jobConf, AvroUtils.getAvroInputSchema(jobConf));
        AvroUtils.runAvroJob(jobConf);
    }

    // Also do a full prediction on best-model if it exists. The file system is resolved from
    // whichever JobConf was created last (the initial one when no lambda was listed).
    final FileSystem fileSystem = FileSystem.get(jobConf);
    final String bestModelDir = jobProps.getString(MODEL_BASE_PATH) + "/best-model";
    if (!fileSystem.exists(new Path(bestModelDir))) {
        return;
    }
    final String bestOutputDir = baseOutputDir + "/best-model";
    jobProps.put(AbstractAvroJob.OUTPUT_PATH, bestOutputDir);
    jobConf = createJobConf(AdmmTestMapper.class, AdmmTestReducer.class);
    AvroOutputFormat.setOutputPath(jobConf, new Path(bestOutputDir));
    AvroUtils.addAvroCacheFiles(jobConf, new Path(bestModelDir));
    jobConf.set(MODEL_PATH, bestModelDir);
    jobConf.setFloat(LAMBDA, -1);
    jobConf.setBoolean(BINARY_FEATURE, jobProps.getBoolean(BINARY_FEATURE, false));
    AvroJob.setInputSchema(jobConf, AvroUtils.getAvroInputSchema(jobConf));
    AvroUtils.runAvroJob(jobConf);
}

From source file:com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy.java

License:Apache License

/**
 * Overrides two Hadoop settings on the step's configuration using values obtained from
 * {@code BloomProps}: {@code mapred.reduce.tasks} and {@code io.sort.record.percent}.
 *
 * @param currentStep flow step whose configuration is modified in place
 */
private void prepareBloomFilterBuilder(FlowStep<JobConf> currentStep) {
    final JobConf stepConf = currentStep.getConfig();

    final String reduceTasks = Integer.toString(BloomProps.getNumSplits(stepConf));
    stepConf.set("mapred.reduce.tasks", reduceTasks);

    final String sortRecordPercent = Double.toString(BloomProps.getIOSortPercent(stepConf));
    stepConf.set("io.sort.record.percent", sortRecordPercent);
}

From source file:com.liveramp.cascading_ext.bloom.BloomAssemblyStrategy.java

License:Apache License

/**
 * Merges the bloom filter parts created across multiple splits of the keys and puts the
 * result in the distributed cache.
 *
 * For every predecessor step whose {@code TARGET_BLOOM_FILTER_ID} equals {@code bloomID},
 * the step's sampling counters are used to estimate average key and tuple sizes, the
 * resulting bloom filter and distributed-cache properties are copied onto the current
 * step's configuration, and the filter is written via {@code BloomUtil.writeFilterToHdfs}.
 *
 * @param bloomID          id of the bloom filter required by {@code currentStep}
 * @param currentStep      step whose configuration is updated
 * @param predecessorSteps steps searched for the producer of the required filter
 * @throws RuntimeException wrapping any exception raised while building the filter
 */
private void buildBloomfilter(String bloomID, FlowStep<JobConf> currentStep,
        List<FlowStep<JobConf>> predecessorSteps) {
    try {
        final JobConf consumerConf = currentStep.getConfig();
        consumerConf.set("io.sort.mb", Integer.toString(BloomProps.getBufferSize(consumerConf)));
        consumerConf.set("mapred.job.reuse.jvm.num.tasks", "-1");

        final String bloomPath = consumerConf.get(BloomProps.REQUIRED_BLOOM_FILTER_PATH);

        for (FlowStep<JobConf> predecessor : predecessorSteps) {
            final JobConf producerConf = predecessor.getConfig();
            final String producedBloomId = producerConf.get(BloomProps.TARGET_BLOOM_FILTER_ID);

            // Skip steps that do not produce the filter we need.
            if (!bloomID.equals(producedBloomId)) {
                continue;
            }
            LOG.info("Found step generating required bloom filter: " + producedBloomId);

            // Counters from the producing job approximate the average key/tuple size.
            final FlowStepStats producerStats = ((BaseFlowStep) predecessor).getFlowStepStats();
            final long sampledCount = Counters.get(producerStats,
                    CreateBloomFilter.StatsCounters.TOTAL_SAMPLED_TUPLES);
            final long keySizeTotal = Counters.get(producerStats, CreateBloomFilter.StatsCounters.KEY_SIZE_SUM);
            final long tupleSizeTotal = Counters.get(producerStats,
                    CreateBloomFilter.StatsCounters.TUPLE_SIZE_SUM);

            // Averages stay at 0 when nothing was sampled (avoids dividing by zero).
            final boolean anySampled = sampledCount != 0;
            final int avgKeySize = anySampled ? (int) (keySizeTotal / sampledCount) : 0;
            final int avgMatchSize = anySampled ? (int) (tupleSizeTotal / sampledCount) : 0;

            LOG.info("Avg key size ~= " + avgKeySize);
            LOG.info("Avg match size ~= " + avgMatchSize);
            final Map<String, String> filterProps = BloomUtil.getPropertiesForBloomFilter(avgMatchSize,
                    avgKeySize);
            for (Map.Entry<String, String> filterProp : filterProps.entrySet()) {
                consumerConf.set(filterProp.getKey(), filterProp.getValue());
            }

            // Put merged result in distributed cache; append to any value already present.
            LOG.info("Adding dist cache properties to config:");
            final Map<String, String> cacheProps = BloomUtil.getPropertiesForDistCache(bloomPath);
            for (Map.Entry<String, String> cacheProp : cacheProps.entrySet()) {
                final String key = cacheProp.getKey();
                final String value = cacheProp.getValue();
                LOG.info(key + " = " + value);
                final String existing = consumerConf.get(key);
                if (existing == null) {
                    consumerConf.set(key, value);
                } else {
                    LOG.info("found already existing value for key: " + key + ", found "
                            + existing + ".  Appending.");
                    consumerConf.set(key, existing + "," + value);
                }
            }

            BloomUtil.writeFilterToHdfs(producerConf, bloomPath);
        }
    } catch (Exception e) {
        throw new RuntimeException("Failed to create bloom filter!", e);
    }
}

From source file:com.liveramp.hank.cascading.DomainBuilderTap.java

License:Apache License

/**
 * Initializes the sink side of the job configuration: sets this tap's output format and the
 * {@code DomainBuilderOutputCommitter}, then records this tap's domain name in {@code conf}.
 *
 * @param process flow process passed through to the superclass
 * @param conf    job configuration to update
 * @throws RuntimeException if a domain name is already present in {@code conf}
 */
public void sinkConfInit(FlowProcess<JobConf> process, JobConf conf) {
    super.sinkConfInit(process, conf);

    conf.setOutputFormat(this.outputFormatClass);
    conf.setOutputCommitter(DomainBuilderOutputCommitter.class);

    // The domain name may be set only once per configuration.
    final String existingDomainName = conf.get(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME);
    if (existingDomainName != null) {
        throw new RuntimeException("Trying to set domain name configuration parameter to " + domainName
                + " but it was previously set to "
                + existingDomainName);
    }
    conf.set(DomainBuilderAbstractOutputFormat.CONF_PARAM_HANK_DOMAIN_NAME, domainName);
}