List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
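A minimal usage sketch (not taken from the examples below) showing how an int written with setInt is read back through the inherited getInt; the driver class MyJob and the property name "example.num.chunks" are illustrative assumptions:

JobConf conf = new JobConf(MyJob.class);              // MyJob is a placeholder driver class
conf.setInt("example.num.chunks", 8);                 // store an int-valued property in the job configuration
int numChunks = conf.getInt("example.num.chunks", 1); // read it back (inherited from Configuration); 1 is the default if unset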
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.HashIdMR.java
License:Open Source License
/**
 * @param inputpath
 *          the path to a unique vertex list. Each line is parsed into (vid,
 *          data) using {@code vidparser} and {@code vdataparser}.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
  JobConf conf = new JobConf(HashIdMR.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setMapperClass(HashIdMapper.class);
  conf.setReducerClass(HashIdReducer.class);

  conf.setInputFormat(NLineInputFormat.class);
  conf.setOutputFormat(MultiDirOutputFormat.class);

  conf.setInt("mapred.line.input.format.linespermap", linespermap);
  conf.set("GraphParser", graphparser.getClass().getName());
  conf.set("VidParser", vidparser.getClass().getName());
  conf.set("VdataParser", vdataparser.getClass().getName());

  FileInputFormat.setInputPaths(conf, new Path(inputpath));
  FileOutputFormat.setOutputPath(conf, new Path(outputpath));

  LOG.info("====== Job: Create integer Id maps for vertices ==========");
  LOG.info("Input = " + inputpath);
  LOG.info("Output = " + outputpath);
  LOG.debug("Lines per map = 6000000");
  LOG.debug("GraphParser = " + graphparser.getClass().getName());
  LOG.debug("VidParser = " + vidparser.getClass().getName());
  LOG.debug("VdataParser = " + vdataparser.getClass().getName());
  LOG.info("==========================================================");

  JobClient.runJob(conf);
  LOG.info("=======================Done =====================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortDictMR.java
License:Open Source License
/**
 * @param inputpath
 *          the path to a rawId to newId dictionary.
 * @param outputpath
 *          the path of output directory.
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
  JobConf conf = new JobConf(SortDictMR.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setMapperClass(SortDictMapper.class);
  conf.setReducerClass(SortDictReducer.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setBoolean("hashRawVid", hashRawVid);
  conf.setInt("numChunks", numChunks);
  conf.set("VidParser", vidparser.getClass().getName());

  String outprefix = "vidhashmap";
  for (int i = 0; i < numChunks; i++) {
    MultipleOutputs.addNamedOutput(conf, outprefix + i, TextOutputFormat.class, Text.class, Text.class);
  }

  FileInputFormat.setInputPaths(conf, new Path(inputpath));
  FileOutputFormat.setOutputPath(conf, new Path(outputpath));

  LOG.info("========== Job: Partition the map of rawid -> id ===========");
  LOG.info("Input = " + inputpath);
  LOG.info("Output = " + outputpath);
  LOG.info("======================================================");
  if (hashRawVid)
    LOG.info("Partition on rawId.");
  else
    LOG.info("Partition on newId");
  LOG.debug("numChunks = " + numChunks);
  LOG.debug("VidParser = " + vidparser.getClass().getName());

  JobClient.runJob(conf);
  LOG.info("======================= Done ==========================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.SortEdgeMR.java
License:Open Source License
public void run(String inputpath, String outputpath) throws IOException {
  JobConf conf = new JobConf(SortEdgeMR.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  // note: the next two calls override the Text output classes set just above;
  // the sibling jobs in this package call setMapOutputKeyClass/setMapOutputValueClass at this point
  conf.setOutputKeyClass(IntWritable.class);
  conf.setOutputValueClass(Text.class);

  conf.setMapperClass(SortEdgeMapper.class);
  conf.setReducerClass(SortEdgeReducer.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setInt("numChunks", numChunks);
  conf.set("GraphParser", graphparser.getClass().getName());
  conf.set("VidParser", vidparser.getClass().getName());
  conf.set("EdataParser", edataparser.getClass().getName());

  FileInputFormat.setInputPaths(conf, new Path(inputpath));
  FileOutputFormat.setOutputPath(conf, new Path(outputpath));

  LOG.info("==== Job: Partition the input edges by hash(sourceid) =========");
  LOG.info("Input = " + inputpath);
  LOG.info("Output = " + outputpath);
  LOG.debug("numChunks = " + numChunks);
  LOG.debug("GraphParser = " + graphparser.getClass().getName());
  LOG.debug("VidParser = " + vidparser.getClass().getName());
  LOG.debug("EdataParser = " + edataparser.getClass().getName());
  LOG.info("===============================================================");

  JobClient.runJob(conf);
  LOG.info("=================== Done ====================================\n");
}
From source file:com.intel.hadoop.graphbuilder.idnormalize.mapreduce.TransEdgeMR.java
License:Open Source License
/**
 * @param inputpath
 *          path of the partitioned edge list
 * @param outputpath
 *          path of the output directory
 * @throws IOException
 */
public void run(String inputpath, String outputpath) throws IOException {
  JobConf conf = new JobConf(TransEdgeMR.class);

  conf.setOutputKeyClass(Text.class);
  conf.setOutputValueClass(Text.class);
  conf.setMapOutputKeyClass(IntWritable.class);
  conf.setMapOutputValueClass(Text.class);

  conf.setMapperClass(TransEdgeMapper.class);
  conf.setReducerClass(TransEdgeReducer.class);

  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(TextOutputFormat.class);

  conf.setInt("numChunks", numChunks);
  conf.set("GraphParser", graphparser.getClass().getName());
  conf.set("VidParser", vidparser.getClass().getName());
  conf.set("EdataParser", edataparser.getClass().getName());
  conf.set("dictionaryPath", dictionaryPath);

  FileInputFormat.setInputPaths(conf, new Path(inputpath));
  FileOutputFormat.setOutputPath(conf, new Path(outputpath));

  LOG.info("============= Job: Normalize Ids in Edges ====================");
  LOG.info("Input = " + inputpath);
  LOG.info("Output = " + outputpath);
  LOG.info("Dictionary = " + dictionaryPath);
  LOG.debug("numChunks = " + numChunks);
  LOG.debug("GraphParser = " + graphparser.getClass().getName());
  LOG.debug("VidParser = " + vidparser.getClass().getName());
  LOG.debug("EdataParser = " + edataparser.getClass().getName());
  LOG.info("===============================================================");

  JobClient.runJob(conf);
  LOG.info("========================= Done ===============================");
}
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/**
 * Initialize ExecFilesMapper specific job-configuration.
 *
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
  jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());
  jobConf.set(EXEC_CMD_LABEL, args.execCmd);

  // set boolean values
  jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname,
      args.flags.contains(Options.REDIRECT_ERROR_TO_OUT));

  final String randomId = getRandomId();
  JobClient jClient = new JobClient(jobConf);
  Path stagingArea;
  try {
    stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
  } catch (InterruptedException e) {
    throw new IOException(e);
  }
  Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
  FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
  FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms);
  jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

  FileSystem dstfs = args.dst.getFileSystem(conf);

  // get tokens for all the required FileSystems..
  TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

  boolean dstExists = dstfs.exists(args.dst);
  boolean dstIsDir = false;
  if (dstExists) {
    dstIsDir = dstfs.getFileStatus(args.dst).isDir();
  }

  // default logPath
  Path logPath = args.log;
  if (logPath == null) {
    String filename = "_" + NAME + "_logs_" + randomId;
    if (!dstExists || !dstIsDir) {
      Path parent = args.dst.getParent();
      if (!dstfs.exists(parent)) {
        dstfs.mkdirs(parent);
      }
      logPath = new Path(parent, filename);
    } else {
      logPath = new Path(args.dst, filename);
    }
  }
  FileOutputFormat.setOutputPath(jobConf, logPath);

  // create src list, dst list
  FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

  Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files");
  jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
  SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
      FilePair.class, SequenceFile.CompressionType.NONE);

  Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files");
  SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class, Text.class,
      SequenceFile.CompressionType.NONE);

  Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs");
  jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
  SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
      FilePair.class, SequenceFile.CompressionType.NONE);

  // handle the case where the destination directory doesn't exist
  // and we've only a single src directory.
  final boolean special = (args.srcs.size() == 1 && !dstExists);
  int srcCount = 0, cnsyncf = 0, dirsyn = 0;
  long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
  try {
    for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
      final Path src = srcItr.next();
      FileSystem srcfs = src.getFileSystem(conf);
      FileStatus srcfilestat = srcfs.getFileStatus(src);
      Path root = special && srcfilestat.isDir() ? src : src.getParent();
      if (srcfilestat.isDir()) {
        ++srcCount;
      }

      Stack<FileStatus> pathstack = new Stack<FileStatus>();
      for (pathstack.push(srcfilestat); !pathstack.empty();) {
        FileStatus cur = pathstack.pop();
        FileStatus[] children = srcfs.listStatus(cur.getPath());
        for (int i = 0; i < children.length; i++) {
          boolean skipfile = false;
          final FileStatus child = children[i];
          final String dst = makeRelative(root, child.getPath());

          ++srcCount;
          if (child.isDir()) {
            pathstack.push(child);
          } else {
            if (!skipfile) {
              ++fileCount;
              byteCount += child.getLen();

              if (LOG.isTraceEnabled()) {
                LOG.trace("adding file " + child.getPath());
              }

              ++cnsyncf;
              cbsyncs += child.getLen();
              if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) {
                src_writer.sync();
                dst_writer.sync();
                cnsyncf = 0;
                cbsyncs = 0L;
              }
            }
          }

          if (!skipfile) {
            src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                new FilePair(child, dst));
          }

          dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
        }

        if (cur.isDir()) {
          String dst = makeRelative(root, cur.getPath());
          dir_writer.append(new Text(dst), new FilePair(cur, dst));
          if (++dirsyn > SYNC_FILE_MAX) {
            dirsyn = 0;
            dir_writer.sync();
          }
        }
      }
    }
  } finally {
    checkAndClose(src_writer);
    checkAndClose(dst_writer);
    checkAndClose(dir_writer);
  }

  FileStatus dststatus = null;
  try {
    dststatus = dstfs.getFileStatus(args.dst);
  } catch (FileNotFoundException fnfe) {
    LOG.info(args.dst + " does not exist.");
  }

  // create dest path dir if copying > 1 file
  if (dststatus == null) {
    if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
      throw new IOException("Failed to create" + args.dst);
    }
  }

  final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted");
  checkDuplication(jobfs, dstfilelist, sorted, conf);

  Path tmpDir = new Path(
      (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
      "_" + NAME + "_tmp_" + randomId);
  jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

  LOG.info("sourcePathsCount=" + srcCount);
  LOG.info("filesToExecCount=" + fileCount);
  LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount));
  jobConf.setInt(SRC_COUNT_LABEL, srcCount);
  jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
  setMapCount(fileCount, jobConf);
  return fileCount > 0;
}
From source file:com.linkedin.mlease.regression.jobs.ItemModelTest.java
License:Open Source License
@Override
public void run() throws Exception {
  JobConfig props = super.getJobConfig();
  List<String> lambdastr = props.getStringList(LAMBDA, ",");
  String outBasePath = props.getString(OUTPUT_BASE_PATH);
  for (String lambda : lambdastr) {
    String outPath = outBasePath + "/lambda-" + lambda;
    props.put("output.path", outPath);
    JobConf conf = createJobConf(PerItemTestMapper.class, PerItemTestReducer.class);
    AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(props.get(MODEL_PATH)), MODEL_PATH);
    conf.set(ITEM_KEY, props.getString(ITEM_KEY));
    conf.setFloat(LAMBDA, Float.parseFloat(lambda));
    conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
    conf.setPartitionerClass(PerItemTestPartitioner.class);
    conf.setInt(NUM_REDUCERS, conf.getNumReduceTasks());
    AvroUtils.runAvroJob(conf);
  }
}
From source file:com.linkedin.mlease.regression.jobs.RegressionAdmmTrain.java
License:Open Source License
@Override
public void run() throws Exception {
  _logger.info("Now running Regression Train using ADMM...");
  JobConfig props = super.getJobConfig();
  String outBasePath = props.getString(OUTPUT_BASE_PATH);
  JobConf conf = super.createJobConf();

  // Various configs
  int nblocks = props.getInt(NUM_BLOCKS);
  int niter = props.getInt(NUM_ITERS, 10);
  // Aggressive decay of liblinear_epsilon
  boolean aggressiveLiblinearEpsilonDecay = props.getBoolean(AGGRESSIVE_LIBLINEAR_EPSILON_DECAY, false);
  // Getting the value of the regularizer L1/L2
  int reg = props.getInt(REGULARIZER);
  if ((reg != 1) && (reg != 2)) {
    throw new IOException("Only L1 and L2 regularization supported!");
  }
  int numClickReplicates = props.getInt(NUM_CLICK_REPLICATES, 1);
  boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false);
  float initializeBoostRate = props.getFloat(INITIALIZE_BOOST_RATE, 0);
  float rhoAdaptCoefficient = props.getFloat(RHO_ADAPT_COEFFICIENT, 0);

  // handling lambda and rho
  // initialize z and u and compute z-u and write to hadoop
  Map<String, LinearModel> z = new HashMap<String, LinearModel>(); // lambda ->
  List<String> lambdastr = props.getStringList(LAMBDA, ",");
  List<String> rhostr = props.getStringList(RHO, null, ",");
  if (rhostr != null) {
    if (rhostr.size() != lambdastr.size())
      throw new IOException(
          "The number of rho's should be exactly the same as the number of lambda's. OR: don't claim rho!");
  }
  Map<Float, Float> lambdaRho = new HashMap<Float, Float>();
  for (int j = 0; j < lambdastr.size(); j++) {
    float lambda = Float.parseFloat(lambdastr.get(j));
    float rho;
    if (rhostr != null) {
      rho = Float.parseFloat(rhostr.get(j));
    } else {
      if (lambda <= 100) {
        rho = 1;
      } else {
        rho = 10;
      }
    }
    lambdaRho.put(lambda, rho);
    z.put(String.valueOf(lambda), new LinearModel());
  }

  // Get specific lambda treatment for some features
  String lambdaMapPath = props.getString(LAMBDA_MAP, "");
  Map<String, Float> lambdaMap = new HashMap<String, Float>();
  if (!lambdaMapPath.equals("")) {
    AvroHdfsFileReader reader = new AvroHdfsFileReader(conf);
    ReadLambdaMapConsumer consumer = new ReadLambdaMapConsumer();
    reader.build(lambdaMapPath, consumer);
    consumer.done();
    lambdaMap = consumer.get();
  }
  _logger.info("Lambda Map has size = " + String.valueOf(lambdaMap.size()));

  // Write lambda_rho mapping into file
  String rhoPath = outBasePath + "/lambda-rho/part-r-00000.avro";
  writeLambdaRho(conf, rhoPath, lambdaRho);

  // test-loglik computation
  boolean testLoglikPerIter = props.getBoolean(TEST_LOGLIK_PER_ITER, false);
  DataFileWriter<GenericRecord> testRecordWriter = null;
  // test if the test file exists
  String testPath = props.getString(TEST_PATH, "");
  testLoglikPerIter = Util.checkPath(testPath);
  if (testLoglikPerIter) {
    List<Path> testPathList = AvroUtils.enumerateFiles(conf, new Path(testPath));
    if (testPathList.size() > 0) {
      testPath = testPathList.get(0).toString();
      _logger.info("Sample test path = " + testPath);
      AvroHdfsFileWriter<GenericRecord> writer = new AvroHdfsFileWriter<GenericRecord>(conf,
          outBasePath + "/sample-test-loglik/write-test-00000.avro", SampleTestLoglik.SCHEMA$);
      testRecordWriter = writer.get();
    }
  }
  if (testRecordWriter == null) {
    testLoglikPerIter = false;
    _logger.info("test.loglik.per.iter=false or test path doesn't exist or is empty! "
        + "So we will not output test loglik per iteration.");
  } else {
    testRecordWriter.close();
  }

  MutableFloat bestTestLoglik = new MutableFloat(-9999999);

  // Initialize z by mean model
  if (initializeBoostRate > 0 && reg == 2) {
    _logger.info("Now start mean model initializing......");
    // Different paths for L1 vs L2 set from job file
    String initalModelPath;
    initalModelPath = outBasePath + "/initialModel";
    Path initalModelPathFromNaiveTrain = new Path(outBasePath, "models");
    JobConfig propsIni = JobConfig.clone(props);
    if (!propsIni.containsKey(LIBLINEAR_EPSILON)) {
      propsIni.put(LIBLINEAR_EPSILON, 0.01);
    }
    propsIni.put(RegressionNaiveTrain.HEAVY_PER_ITEM_TRAIN, "true");
    propsIni.put(LAMBDA_MAP, lambdaMapPath);
    propsIni.put(REMOVE_TMP_DIR, "false");

    // run job
    RegressionNaiveTrain initializationJob = new RegressionNaiveTrain(
        super.getJobId() + "_ADMMInitialization", propsIni);
    initializationJob.run();

    FileSystem fs = initalModelPathFromNaiveTrain.getFileSystem(conf);
    if (fs.exists(new Path(initalModelPath))) {
      fs.delete(new Path(initalModelPath), true);
    }
    fs.rename(initalModelPathFromNaiveTrain, new Path(initalModelPath));

    // set up lambda
    Set<Float> lambdaSet = new HashSet<Float>();
    for (String l : lambdastr) {
      lambdaSet.add(Float.parseFloat(l));
    }

    // Compute Mean model as initial model
    z = LinearModelUtils.meanModel(conf, initalModelPath, nblocks, lambdaSet.size(), true);
    if (testLoglikPerIter) {
      updateLogLikBestModel(conf, 0, z, testPath, ignoreValue, bestTestLoglik, outBasePath, numClickReplicates);
    }
  }

  double mindiff = 99999999;
  float liblinearEpsilon = 0.01f;
  int i;
  for (i = 1; i <= niter; i++) {
    _logger.info("Now starting iteration " + String.valueOf(i));

    // set up configuration
    props.put(AbstractAvroJob.OUTPUT_PATH, outBasePath + "/iter-" + String.valueOf(i));
    conf = createJobConf(AdmmMapper.class, AdmmReducer.class,
        Pair.getPairSchema(Schema.create(Type.INT), RegressionPrepareOutput.SCHEMA$),
        RegressionTrainOutput.SCHEMA$);
    conf.setPartitionerClass(AdmmPartitioner.class);
    //AvroUtils.setSpecificReducerInput(conf, true);
    conf.setInt(NUM_BLOCKS, nblocks);
    //Added for L1/L2
    conf.setInt(REGULARIZER, reg);
    conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
    //boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false);
    conf.setBoolean(BINARY_FEATURE, ignoreValue);
    conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));
    boolean penalizeIntercept = props.getBoolean(PENALIZE_INTERCEPT, false);
    String interceptKey = props.getString(INTERCEPT_KEY, LibLinearDataset.INTERCEPT_NAME);
    conf.set(INTERCEPT_KEY, interceptKey);
    //int schemaType = props.getInt(SCHEMA_TYPE, 1);

    // compute and store u into file
    // u = uplusx - z
    String uPath = outBasePath + "/iter-" + String.valueOf(i) + "/u/part-r-00000.avro";
    if (i == 1) {
      LinearModelUtils.writeLinearModel(conf, uPath, new HashMap<String, LinearModel>());
      if (initializeBoostRate > 0 && reg == 2) {
        conf.setFloat(RHO_ADAPT_RATE, initializeBoostRate);
      }
    } else {
      String uplusxPath = outBasePath + "/iter-" + String.valueOf(i - 1) + "/model";
      computeU(conf, uPath, uplusxPath, z);
      if (rhoAdaptCoefficient > 0) {
        float curRhoAdaptRate = (float) Math.exp(-(i - 1) * rhoAdaptCoefficient);
        conf.setFloat(RHO_ADAPT_RATE, curRhoAdaptRate);
      }
    }

    // write z into file
    String zPath = outBasePath + "/iter-" + String.valueOf(i) + "/init-value/part-r-00000.avro";
    LinearModelUtils.writeLinearModel(conf, zPath, z);

    // run job
    String outpath = outBasePath + "/iter-" + String.valueOf(i) + "/model";
    conf.set(U_PATH, uPath);
    conf.set(INIT_VALUE_PATH, zPath);
    conf.set(LAMBDA_RHO_MAP, rhoPath);
    if (i > 1 && mindiff < 0.001 && !aggressiveLiblinearEpsilonDecay) // need to get a more accurate estimate from liblinear
    {
      liblinearEpsilon = liblinearEpsilon / 10;
    } else if (aggressiveLiblinearEpsilonDecay && i > 5) {
      liblinearEpsilon = liblinearEpsilon / 10;
    }
    conf.setFloat(LIBLINEAR_EPSILON, liblinearEpsilon);
    //Added for logging aggressive decay
    _logger.info("Liblinear Epsilon for iter = " + String.valueOf(i) + " is: " + String.valueOf(liblinearEpsilon));
    _logger.info("aggressiveLiblinearEpsilonDecay=" + aggressiveLiblinearEpsilonDecay);
    AvroOutputFormat.setOutputPath(conf, new Path(outpath));
    AvroUtils.addAvroCacheFiles(conf, new Path(uPath));
    AvroUtils.addAvroCacheFiles(conf, new Path(zPath));
    AvroUtils.addAvroCacheFiles(conf, new Path(rhoPath));
    conf.setNumReduceTasks(nblocks * lambdastr.size());
    AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
    AvroUtils.runAvroJob(conf);

    // Load the result from the last iteration
    // compute z and u given x
    Map<String, LinearModel> xbar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaRho.size(), true);
    Map<String, LinearModel> ubar = LinearModelUtils.meanModel(conf, uPath, nblocks, lambdaRho.size(), false);
    Map<String, LinearModel> lastz = new HashMap<String, LinearModel>();
    for (String k : z.keySet()) {
      lastz.put(k, z.get(k).copy());
    }
    for (String lambda : xbar.keySet()) {
      LinearModel thisz = z.get(lambda);
      thisz.clear();
      float l = Float.parseFloat(lambda);
      float r = lambdaRho.get(l);
      double weight;
      // L2 regularization
      if (reg == 2) {
        _logger.info("Running code for regularizer = " + String.valueOf(reg));
        weight = nblocks * r / (l + nblocks * r);
        Map<String, Double> weightmap = new HashMap<String, Double>();
        for (String k : lambdaMap.keySet()) {
          weightmap.put(k, nblocks * r / (lambdaMap.get(k) + nblocks * r + 0.0));
        }
        thisz.linearCombine(1.0, weight, xbar.get(lambda), weightmap);
        if (!ubar.isEmpty()) {
          thisz.linearCombine(1.0, weight, ubar.get(lambda), weightmap);
        }
        if (!penalizeIntercept) {
          if (ubar.isEmpty()) {
            thisz.setIntercept(xbar.get(lambda).getIntercept());
          } else {
            thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
          }
        }
        z.put(lambda, thisz);
      } else {
        // L1 regularization
        _logger.info("Running code for regularizer = " + String.valueOf(reg));
        weight = l / (r * nblocks + 0.0);
        Map<String, Double> weightmap = new HashMap<String, Double>();
        for (String k : lambdaMap.keySet()) {
          weightmap.put(k, lambdaMap.get(k) / (r * nblocks + 0.0));
        }
        // LinearModel thisz = new LinearModel();
        thisz.linearCombine(1.0, 1.0, xbar.get(lambda));
        if (!ubar.isEmpty()) {
          thisz.linearCombine(1.0, 1.0, ubar.get(lambda));
        }
        // Iterative Thresholding
        Map<String, Double> thisCoefficients = thisz.getCoefficients();
        for (String k : thisCoefficients.keySet()) {
          double val = thisCoefficients.get(k);
          if (val > weight) {
            thisCoefficients.put(k, val - weight);
          } else if (val < -weight) {
            thisCoefficients.put(k, val + weight);
          }
        }
        thisz.setCoefficients(thisCoefficients);
        if (!penalizeIntercept) {
          if (ubar.isEmpty()) {
            thisz.setIntercept(xbar.get(lambda).getIntercept());
          } else {
            thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept());
          }
        }
        z.put(lambda, thisz);
      }
    }
    xbar.clear();
    ubar.clear();

    // Output max difference between last z and this z
    mindiff = 99999999;
    double maxdiff = 0;
    for (String k : z.keySet()) {
      LinearModel tmp = lastz.get(k);
      if (tmp == null)
        tmp = new LinearModel();
      tmp.linearCombine(1, -1, z.get(k));
      double diff = tmp.maxAbsValue();
      _logger.info("For lambda=" + k + ": Max Difference between last z and this z = " + String.valueOf(diff));
      tmp.clear();
      if (mindiff > diff)
        mindiff = diff;
      if (maxdiff < diff)
        maxdiff = diff;
    }
    double epsilon = props.getDouble(EPSILON, 0.0001);

    // remove tmp files?
    if (props.getBoolean(REMOVE_TMP_DIR, false) && i >= 2) {
      FileSystem fs = FileSystem.get(conf);
      fs.delete(new Path(outBasePath + "/iter-" + String.valueOf(i - 1)), true);
    }

    // Output testloglik and update best model
    if (testLoglikPerIter) {
      updateLogLikBestModel(conf, i, z, testPath, ignoreValue, bestTestLoglik, outBasePath, numClickReplicates);
    }

    if (maxdiff < epsilon && liblinearEpsilon <= 0.00001) {
      break;
    }
  }

  // write z into file
  String zPath = outBasePath + "/final-model/part-r-00000.avro";
  LinearModelUtils.writeLinearModel(conf, zPath, z);

  // remove tmp files?
  if (props.getBoolean(REMOVE_TMP_DIR, false)) {
    FileSystem fs = FileSystem.get(conf);
    Path initalModelPath = new Path(outBasePath + "/initialModel");
    if (fs.exists(initalModelPath)) {
      fs.delete(initalModelPath, true);
    }
    for (int j = i - 2; j <= i; j++) {
      Path deletepath = new Path(outBasePath + "/iter-" + String.valueOf(j));
      if (fs.exists(deletepath)) {
        fs.delete(deletepath, true);
      }
    }
    fs.delete(new Path(outBasePath + "/tmp-data"), true);
  }
}
From source file:com.linkedin.mlease.regression.jobs.RegressionNaiveTrain.java
License:Open Source License
@Override
public void run() throws Exception {
  JobConfig props = super.getJobConfig();
  String outBasePath = props.getString(OUTPUT_BASE_PATH);
  boolean heavyPerItemTrain = props.getBoolean(HEAVY_PER_ITEM_TRAIN, false);
  String partitionIdPath = "";
  if (heavyPerItemTrain) {
    partitionIdPath = outBasePath + "/partitionIds";
    props.put(AbstractAvroJob.OUTPUT_PATH, partitionIdPath);
    JobConf conf = createJobConf(PartitionIdAssignerMapper.class, PartitionIdAssignerReducer.class,
        PartitionIdAssignerCombiner.class,
        Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)),
        Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    conf.set(LAMBDA, props.getString(LAMBDA));
    AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
    conf.setNumReduceTasks(1);
    AvroUtils.runAvroJob(conf);
  }

  _logger.info("Start training per-key naive logistic regression model...");
  String outpath = outBasePath + "/models";
  props.put(AbstractAvroJob.OUTPUT_PATH, outpath);
  JobConf conf = createJobConf(NaiveMapper.class, NaiveReducer.class,
      Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$),
      LinearModelAvro.SCHEMA$);

  // set up conf
  boolean computeModelMean = props.getBoolean(COMPUTE_MODEL_MEAN, true);
  int nblocks = -1;
  if (computeModelMean) {
    nblocks = props.getInt(NUM_BLOCKS);
    conf.setInt(NUM_BLOCKS, nblocks);
  }
  List<String> lambdastr = props.getStringList(LAMBDA, ",");
  conf.set(LAMBDA, props.getString(LAMBDA));
  conf.setFloat(PRIOR_MEAN, props.getFloat(PRIOR_MEAN, 0.0));
  conf.setBoolean(PENALIZE_INTERCEPT, props.getBoolean(PENALIZE_INTERCEPT, false));
  conf.setBoolean(HAS_INTERCEPT, props.getBoolean(HAS_INTERCEPT, true));
  conf.set(INTERCEPT_KEY, props.getString(INTERCEPT_KEY, LIBLINEAR_INTERCEPT_KEY));
  conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000));
  boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true);
  conf.setFloat(LIBLINEAR_EPSILON, props.getFloat(LIBLINEAR_EPSILON, 0.001f));
  String lambdaMap = props.getString(LAMBDA_MAP, "");
  conf.set(LAMBDA_MAP, lambdaMap);
  if (!lambdaMap.equals("")) {
    AvroUtils.addAvroCacheFiles(conf, new Path(lambdaMap));
  }
  conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false));
  conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false));

  // set up lambda
  Set<Float> lambdaSet = new HashSet<Float>();
  for (String l : lambdastr) {
    lambdaSet.add(Float.parseFloat(l));
  }
  conf.setInt(DATA_SIZE_THRESHOLD, props.getInt(DATA_SIZE_THRESHOLD, 0));

  // set up partition id
  if (heavyPerItemTrain && !partitionIdPath.equals("")) {
    conf.set(PARTITION_ID_PATH, partitionIdPath);
    AvroHdfsFileReader reader = new AvroHdfsFileReader(conf);
    ReadPartitionIdAssignmentConsumer consumer = new ReadPartitionIdAssignmentConsumer();
    reader.build(partitionIdPath, consumer);
    Map<String, Integer> partitionIdMap = consumer.get();
    int maxPartitionId = 0;
    for (int v : partitionIdMap.values()) {
      if (v > maxPartitionId) {
        maxPartitionId = v;
      }
    }
    AvroUtils.addAvroCacheFiles(conf, new Path(partitionIdPath));
    conf.setNumReduceTasks(maxPartitionId + 1);
    conf.setPartitionerClass(NaivePartitioner.class);
  }

  // run job
  AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$);
  AvroUtils.runAvroJob(conf);

  // Compute Mean
  if (computeModelMean) {
    Map<String, LinearModel> betabar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaSet.size(), true);
    // Output the mean for each lambda
    // write z into file
    String finalOutPath = outBasePath + "/final-model/part-r-00000.avro";
    LinearModelUtils.writeLinearModel(conf, finalOutPath, betabar);
  }

  // remove tmp dir
  if (removeTmpDir) {
    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(outBasePath + "/tmp-data"), true);
  }
}
From source file:com.linkedin.mlease.regression.jobs.RegressionPrepare.java
License:Open Source License
@Override
public void run() throws Exception {
  JobConfig config = super.getJobConfig();
  JobConf conf = super.createJobConf(RegressionPrepareMapper.class, RegressionPrepareOutput.SCHEMA$);
  String mapKey = config.getString(MAP_KEY, "");
  conf.set(MAP_KEY, mapKey);
  conf.setInt(NUM_CLICK_REPLICATES, config.getInt(NUM_CLICK_REPLICATES, 1));
  conf.setBoolean(IGNORE_FEATURE_VALUE, config.getBoolean(IGNORE_FEATURE_VALUE, false));
  int nblocks = config.getInt(NUM_BLOCKS, 0);
  conf.setInt(NUM_BLOCKS, nblocks);
  _logger.info("Running the preparation job of admm with map.key = " + mapKey + " and num.blocks=" + nblocks);
  AvroUtils.runAvroJob(conf);
}
From source file:com.m6d.filecrush.crush.CrushOptionParsingTest.java
License:Apache License
@Before
public void before() throws IOException {
  crush = new Crush();

  JobConf job = new JobConf(false);
  crush.setConf(job);

  job.set("fs.default.name", "file:///");
  job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem");
  job.setInt("mapreduce.job.reduces", 20);
  job.setLong("dfs.blocksize", 1024 * 1024 * 64);

  FileSystem fs = FileSystem.get(job);
  fs.setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath()));
  crush.setFileSystem(fs);
}