List of usage examples for org.apache.hadoop.mapred JobConf setLong
public void setLong(String name, long value)
name
property to a long
. From source file:com.ibm.bi.dml.runtime.transform.ApplyTfBBMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String specPath, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception { CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst); long[] rlens = new long[] { numRows }; long[] clens = new long[] { numColsAfter }; int[] brlens = new int[] { rblk.brlen }; int[] bclens = new int[] { rblk.bclen }; byte[] realIndexes = new byte[] { rblk.input }; byte[] resultIndexes = new byte[] { rblk.output }; JobConf job = new JobConf(ApplyTfBBMR.class); job.setJobName("ApplyTfBB"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfBBMR.class); // set relevant classes job.setMapperClass(ApplyTfBBMapper.class); MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL); MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); MRJobConfiguration.setCSVReblockInstructions(job, rblkInst); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInst); job.setInt("dfs.replication", replication); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false); //set up the number of reducers int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getConfig().getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups); job.setNumReduceTasks(numRed);//from w w w. j av a2s . c o m //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false); // configure mapper and the mapper output key value pairs job.setMapperClass(ApplyTfBBMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(BlockRow.class); //configure reducer job.setReducerClass(CSVReblockReducer.class); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(new Path(partOffsetsFile), "part-00000"); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specPath); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); RunningJob runjob = JobClient.runJob(job); MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job); Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i))); } return new JobReturn(ret.stats, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.transform.ApplyTfCSVMR.java
License:Open Source License
public static JobReturn runJob(String inputPath, String specPath, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(ApplyTfCSVMR.class); job.setJobName("ApplyTfCSV"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfCSVMR.class); // set relevant classes job.setMapperClass(ApplyTfCSVMapper.class); job.setNumReduceTasks(0);/*from www . jav a2s . com*/ // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(partOffsetsFile); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setInt("dfs.replication", replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. Path outPath = new Path(outputPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true); FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specPath); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); // Since transform CSV produces part files w/ prefix transform-part-*, // delete all the "default" part-..... files deletePartFiles(fs, outPath); MatrixCharacteristics mc = new MatrixCharacteristics(); return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful()); }
From source file:com.ibm.bi.dml.runtime.transform.GenTfMtdMR.java
License:Open Source License
public static long runJob(String inputPath, String txMtdPath, String specFileWithIDs, String smallestFile, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(GenTfMtdMR.class); job.setJobName("GenTfMTD"); /* Setup MapReduce Job */ job.setJarByClass(GenTfMtdMR.class); // set relevant classes job.setMapperClass(GTFMTDMapper.class); job.setReducerClass(GTFMTDReducer.class); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DistinctValue.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setInt("dfs.replication", replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. Path outPath = new Path(txMtdPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true);//from w w w . ja v a2 s . c o m FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC_FILE, specFileWithIDs); job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath); // offsets file to store part-file names and offsets for each input split job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); Counters c = runjob.getCounters(); long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter(); return tx_numRows; }
From source file:com.jyz.study.hadoop.mapreduce.datajoin.DataJoinJob.java
License:Apache License
public static JobConf createDataJoinJob(String args[]) throws IOException { String inputDir = args[0];/*from ww w. j av a 2s .c o m*/ String outputDir = args[1]; Class inputFormat = SequenceFileInputFormat.class; if (args[2].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileInputFormat: " + args[2]); } else { System.out.println("Using TextInputFormat: " + args[2]); inputFormat = TextInputFormat.class; } int numOfReducers = Integer.parseInt(args[3]); Class mapper = getClassByName(args[4]); Class reducer = getClassByName(args[5]); Class mapoutputValueClass = getClassByName(args[6]); Class outputFormat = TextOutputFormat.class; Class outputValueClass = Text.class; if (args[7].compareToIgnoreCase("text") != 0) { System.out.println("Using SequenceFileOutputFormat: " + args[7]); outputFormat = SequenceFileOutputFormat.class; outputValueClass = getClassByName(args[7]); } else { System.out.println("Using TextOutputFormat: " + args[7]); } long maxNumOfValuesPerGroup = 100; String jobName = ""; if (args.length > 8) { maxNumOfValuesPerGroup = Long.parseLong(args[8]); } if (args.length > 9) { jobName = args[9]; } Configuration defaults = new Configuration(); JobConf job = new JobConf(defaults, DataJoinJob.class); job.setJobName("DataJoinJob: " + jobName); FileSystem fs = FileSystem.get(defaults); fs.delete(new Path(outputDir), true); FileInputFormat.setInputPaths(job, inputDir); job.setInputFormat(inputFormat); job.setMapperClass(mapper); FileOutputFormat.setOutputPath(job, new Path(outputDir)); job.setOutputFormat(outputFormat); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(mapoutputValueClass); job.setOutputKeyClass(Text.class); job.setOutputValueClass(outputValueClass); job.setReducerClass(reducer); job.setNumMapTasks(1); job.setNumReduceTasks(numOfReducers); job.setLong("datajoin.maxNumOfValuesPerGroup", maxNumOfValuesPerGroup); return job; }
From source file:com.kadwa.hadoop.DistExec.java
License:Open Source License
/** * Initialize ExecFilesMapper specific job-configuration. * * @param conf : The dfs/mapred configuration. * @param jobConf : The handle to the jobConf object to be initialized. * @param args Arguments// w ww. j a v a 2 s . c om * @return true if it is necessary to launch a job. */ private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException { jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString()); jobConf.set(EXEC_CMD_LABEL, args.execCmd); //set boolean values jobConf.setBoolean(Options.REDIRECT_ERROR_TO_OUT.propertyname, args.flags.contains(Options.REDIRECT_ERROR_TO_OUT)); final String randomId = getRandomId(); JobClient jClient = new JobClient(jobConf); Path stagingArea; try { stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf); } catch (InterruptedException e) { throw new IOException(e); } Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId); FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION); FileSystem.mkdirs(FileSystem.get(jobDirectory.toUri(), conf), jobDirectory, mapredSysPerms); jobConf.set(JOB_DIR_LABEL, jobDirectory.toString()); FileSystem dstfs = args.dst.getFileSystem(conf); // get tokens for all the required FileSystems.. TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf); boolean dstExists = dstfs.exists(args.dst); boolean dstIsDir = false; if (dstExists) { dstIsDir = dstfs.getFileStatus(args.dst).isDir(); } // default logPath Path logPath = args.log; if (logPath == null) { String filename = "_" + NAME + "_logs_" + randomId; if (!dstExists || !dstIsDir) { Path parent = args.dst.getParent(); if (!dstfs.exists(parent)) { dstfs.mkdirs(parent); } logPath = new Path(parent, filename); } else { logPath = new Path(args.dst, filename); } } FileOutputFormat.setOutputPath(jobConf, logPath); // create src list, dst list FileSystem jobfs = jobDirectory.getFileSystem(jobConf); Path srcfilelist = new Path(jobDirectory, "_" + NAME + "_src_files"); jobConf.set(SRC_LIST_LABEL, srcfilelist.toString()); SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class, FilePair.class, SequenceFile.CompressionType.NONE); Path dstfilelist = new Path(jobDirectory, "_" + NAME + "_dst_files"); SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class, Text.class, SequenceFile.CompressionType.NONE); Path dstdirlist = new Path(jobDirectory, "_" + NAME + "_dst_dirs"); jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString()); SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class, FilePair.class, SequenceFile.CompressionType.NONE); // handle the case where the destination directory doesn't exist // and we've only a single src directory. final boolean special = (args.srcs.size() == 1 && !dstExists); int srcCount = 0, cnsyncf = 0, dirsyn = 0; long fileCount = 0L, byteCount = 0L, cbsyncs = 0L; try { for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) { final Path src = srcItr.next(); FileSystem srcfs = src.getFileSystem(conf); FileStatus srcfilestat = srcfs.getFileStatus(src); Path root = special && srcfilestat.isDir() ? src : src.getParent(); if (srcfilestat.isDir()) { ++srcCount; } Stack<FileStatus> pathstack = new Stack<FileStatus>(); for (pathstack.push(srcfilestat); !pathstack.empty();) { FileStatus cur = pathstack.pop(); FileStatus[] children = srcfs.listStatus(cur.getPath()); for (int i = 0; i < children.length; i++) { boolean skipfile = false; final FileStatus child = children[i]; final String dst = makeRelative(root, child.getPath()); ++srcCount; if (child.isDir()) { pathstack.push(child); } else { if (!skipfile) { ++fileCount; byteCount += child.getLen(); if (LOG.isTraceEnabled()) { LOG.trace("adding file " + child.getPath()); } ++cnsyncf; cbsyncs += child.getLen(); if (cnsyncf > SYNC_FILE_MAX || cbsyncs > BYTES_PER_MAP) { src_writer.sync(); dst_writer.sync(); cnsyncf = 0; cbsyncs = 0L; } } } if (!skipfile) { src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()), new FilePair(child, dst)); } dst_writer.append(new Text(dst), new Text(child.getPath().toString())); } if (cur.isDir()) { String dst = makeRelative(root, cur.getPath()); dir_writer.append(new Text(dst), new FilePair(cur, dst)); if (++dirsyn > SYNC_FILE_MAX) { dirsyn = 0; dir_writer.sync(); } } } } } finally { checkAndClose(src_writer); checkAndClose(dst_writer); checkAndClose(dir_writer); } FileStatus dststatus = null; try { dststatus = dstfs.getFileStatus(args.dst); } catch (FileNotFoundException fnfe) { LOG.info(args.dst + " does not exist."); } // create dest path dir if copying > 1 file if (dststatus == null) { if (srcCount > 1 && !dstfs.mkdirs(args.dst)) { throw new IOException("Failed to create" + args.dst); } } final Path sorted = new Path(jobDirectory, "_" + NAME + "_sorted"); checkDuplication(jobfs, dstfilelist, sorted, conf); Path tmpDir = new Path( (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst, "_" + NAME + "_tmp_" + randomId); jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString()); LOG.info("sourcePathsCount=" + srcCount); LOG.info("filesToExecCount=" + fileCount); LOG.info("bytesToExecCount=" + StringUtils.humanReadableInt(byteCount)); jobConf.setInt(SRC_COUNT_LABEL, srcCount); jobConf.setLong(TOTAL_SIZE_LABEL, byteCount); setMapCount(fileCount, jobConf); return fileCount > 0; }
From source file:com.linkedin.mlease.regression.jobs.ItemModelTrain.java
License:Open Source License
@Override public void run() throws Exception { JobConfig props = super.getJobConfig(); _logger.info("Start training per-key naive logistic regression model..."); String outBasePath = props.getString(OUTPUT_MODEL_PATH); String outpath = outBasePath + "/models"; props.put("output.path", outpath); JobConf conf = createJobConf(ItemModelTrainMapper.class, ItemModelTrainReducer.class, Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$), LinearModelWithVarAvro.SCHEMA$); // set up conf String interceptPriorMeanMap = props.getString(INTERCEPT_PRIOR_MEAN_MAP, ""); if (!interceptPriorMeanMap.equals("")) { AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(interceptPriorMeanMap), INTERCEPT_PRIOR_MEAN_MAP); }/* www . j a v a 2 s. c o m*/ String lambdaMap = props.getString(LAMBDA_MAP, ""); if (!lambdaMap.equals("")) { AvroUtils.addAvroCacheFilesAndSetTheProperty(conf, new Path(lambdaMap), LAMBDA_MAP); } conf.setFloat(INTERCEPT_DEFAULT_PRIOR_MEAN, (float) props.getDouble(INTERCEPT_DEFAULT_PRIOR_MEAN, 0)); conf.set(INTERCEPT_LAMBDAS, props.get(INTERCEPT_LAMBDAS)); conf.set(DEFAULT_LAMBDAS, props.get(DEFAULT_LAMBDAS)); conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000)); conf.setFloat(LIBLINEAR_EPSILON, (float) props.getDouble(LIBLINEAR_EPSILON, 0.001f)); conf.setBoolean(COMPUTE_VAR, props.getBoolean(COMPUTE_VAR, false)); conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false)); conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false)); // run job AvroUtils.runAvroJob(conf); boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true); if (removeTmpDir) { FileSystem fs = FileSystem.get(conf); fs.delete(new Path(outBasePath + "/tmp-data"), true); } }
From source file:com.linkedin.mlease.regression.jobs.RegressionAdmmTrain.java
License:Open Source License
@Override public void run() throws Exception { _logger.info("Now running Regression Train using ADMM..."); JobConfig props = super.getJobConfig(); String outBasePath = props.getString(OUTPUT_BASE_PATH); JobConf conf = super.createJobConf(); // Various configs int nblocks = props.getInt(NUM_BLOCKS); int niter = props.getInt(NUM_ITERS, 10); //Aggressive decay of liblinear_epsilon boolean aggressiveLiblinearEpsilonDecay = props.getBoolean(AGGRESSIVE_LIBLINEAR_EPSILON_DECAY, false); // Getting the value of the regularizer L1/L2 int reg = props.getInt(REGULARIZER); if ((reg != 1) && (reg != 2)) { throw new IOException("Only L1 and L2 regularization supported!"); }//from w w w. j a va2s. co m int numClickReplicates = props.getInt(NUM_CLICK_REPLICATES, 1); boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false); float initializeBoostRate = props.getFloat(INITIALIZE_BOOST_RATE, 0); float rhoAdaptCoefficient = props.getFloat(RHO_ADAPT_COEFFICIENT, 0); // handling lambda and rho // initialize z and u and compute z-u and write to hadoop Map<String, LinearModel> z = new HashMap<String, LinearModel>(); // lambda -> List<String> lambdastr = props.getStringList(LAMBDA, ","); List<String> rhostr = props.getStringList(RHO, null, ","); if (rhostr != null) { if (rhostr.size() != lambdastr.size()) throw new IOException( "The number of rho's should be exactly the same as the number of lambda's. OR: don't claim rho!"); } Map<Float, Float> lambdaRho = new HashMap<Float, Float>(); for (int j = 0; j < lambdastr.size(); j++) { float lambda = Float.parseFloat(lambdastr.get(j)); float rho; if (rhostr != null) { rho = Float.parseFloat(rhostr.get(j)); } else { if (lambda <= 100) { rho = 1; } else { rho = 10; } } lambdaRho.put(lambda, rho); z.put(String.valueOf(lambda), new LinearModel()); } // Get specific lambda treatment for some features String lambdaMapPath = props.getString(LAMBDA_MAP, ""); Map<String, Float> lambdaMap = new HashMap<String, Float>(); if (!lambdaMapPath.equals("")) { AvroHdfsFileReader reader = new AvroHdfsFileReader(conf); ReadLambdaMapConsumer consumer = new ReadLambdaMapConsumer(); reader.build(lambdaMapPath, consumer); consumer.done(); lambdaMap = consumer.get(); } _logger.info("Lambda Map has size = " + String.valueOf(lambdaMap.size())); // Write lambda_rho mapping into file String rhoPath = outBasePath + "/lambda-rho/part-r-00000.avro"; writeLambdaRho(conf, rhoPath, lambdaRho); // test-loglik computation boolean testLoglikPerIter = props.getBoolean(TEST_LOGLIK_PER_ITER, false); DataFileWriter<GenericRecord> testRecordWriter = null; // test if the test file exists String testPath = props.getString(TEST_PATH, ""); testLoglikPerIter = Util.checkPath(testPath); if (testLoglikPerIter) { List<Path> testPathList = AvroUtils.enumerateFiles(conf, new Path(testPath)); if (testPathList.size() > 0) { testPath = testPathList.get(0).toString(); _logger.info("Sample test path = " + testPath); AvroHdfsFileWriter<GenericRecord> writer = new AvroHdfsFileWriter<GenericRecord>(conf, outBasePath + "/sample-test-loglik/write-test-00000.avro", SampleTestLoglik.SCHEMA$); testRecordWriter = writer.get(); } } if (testRecordWriter == null) { testLoglikPerIter = false; _logger.info( "test.loglik.per.iter=false or test path doesn't exist or is empty! So we will not output test loglik per iteration."); } else { testRecordWriter.close(); } MutableFloat bestTestLoglik = new MutableFloat(-9999999); //Initialize z by mean model if (initializeBoostRate > 0 && reg == 2) { _logger.info("Now start mean model initializing......"); // Different paths for L1 vs L2 set from job file String initalModelPath; initalModelPath = outBasePath + "/initialModel"; Path initalModelPathFromNaiveTrain = new Path(outBasePath, "models"); JobConfig propsIni = JobConfig.clone(props); if (!propsIni.containsKey(LIBLINEAR_EPSILON)) { propsIni.put(LIBLINEAR_EPSILON, 0.01); } propsIni.put(RegressionNaiveTrain.HEAVY_PER_ITEM_TRAIN, "true"); propsIni.put(LAMBDA_MAP, lambdaMapPath); propsIni.put(REMOVE_TMP_DIR, "false"); // run job RegressionNaiveTrain initializationJob = new RegressionNaiveTrain( super.getJobId() + "_ADMMInitialization", propsIni); initializationJob.run(); FileSystem fs = initalModelPathFromNaiveTrain.getFileSystem(conf); if (fs.exists(new Path(initalModelPath))) { fs.delete(new Path(initalModelPath), true); } fs.rename(initalModelPathFromNaiveTrain, new Path(initalModelPath)); // set up lambda Set<Float> lambdaSet = new HashSet<Float>(); for (String l : lambdastr) { lambdaSet.add(Float.parseFloat(l)); } // Compute Mean model as initial model z = LinearModelUtils.meanModel(conf, initalModelPath, nblocks, lambdaSet.size(), true); if (testLoglikPerIter) { updateLogLikBestModel(conf, 0, z, testPath, ignoreValue, bestTestLoglik, outBasePath, numClickReplicates); } } double mindiff = 99999999; float liblinearEpsilon = 0.01f; int i; for (i = 1; i <= niter; i++) { _logger.info("Now starting iteration " + String.valueOf(i)); // set up configuration props.put(AbstractAvroJob.OUTPUT_PATH, outBasePath + "/iter-" + String.valueOf(i)); conf = createJobConf(AdmmMapper.class, AdmmReducer.class, Pair.getPairSchema(Schema.create(Type.INT), RegressionPrepareOutput.SCHEMA$), RegressionTrainOutput.SCHEMA$); conf.setPartitionerClass(AdmmPartitioner.class); //AvroUtils.setSpecificReducerInput(conf, true); conf.setInt(NUM_BLOCKS, nblocks); //Added for L1/L2 conf.setInt(REGULARIZER, reg); conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000)); //boolean ignoreValue = props.getBoolean(BINARY_FEATURE, false); conf.setBoolean(BINARY_FEATURE, ignoreValue); conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false)); boolean penalizeIntercept = props.getBoolean(PENALIZE_INTERCEPT, false); String interceptKey = props.getString(INTERCEPT_KEY, LibLinearDataset.INTERCEPT_NAME); conf.set(INTERCEPT_KEY, interceptKey); //int schemaType = props.getInt(SCHEMA_TYPE, 1); // compute and store u into file // u = uplusx - z String uPath = outBasePath + "/iter-" + String.valueOf(i) + "/u/part-r-00000.avro"; if (i == 1) { LinearModelUtils.writeLinearModel(conf, uPath, new HashMap<String, LinearModel>()); if (initializeBoostRate > 0 && reg == 2) { conf.setFloat(RHO_ADAPT_RATE, initializeBoostRate); } } else { String uplusxPath = outBasePath + "/iter-" + String.valueOf(i - 1) + "/model"; computeU(conf, uPath, uplusxPath, z); if (rhoAdaptCoefficient > 0) { float curRhoAdaptRate = (float) Math.exp(-(i - 1) * rhoAdaptCoefficient); conf.setFloat(RHO_ADAPT_RATE, curRhoAdaptRate); } } // write z into file String zPath = outBasePath + "/iter-" + String.valueOf(i) + "/init-value/part-r-00000.avro"; LinearModelUtils.writeLinearModel(conf, zPath, z); // run job String outpath = outBasePath + "/iter-" + String.valueOf(i) + "/model"; conf.set(U_PATH, uPath); conf.set(INIT_VALUE_PATH, zPath); conf.set(LAMBDA_RHO_MAP, rhoPath); if (i > 1 && mindiff < 0.001 && !aggressiveLiblinearEpsilonDecay) // need to get a more accurate estimate from liblinear { liblinearEpsilon = liblinearEpsilon / 10; } else if (aggressiveLiblinearEpsilonDecay && i > 5) { liblinearEpsilon = liblinearEpsilon / 10; } conf.setFloat(LIBLINEAR_EPSILON, liblinearEpsilon); //Added for logging aggressive decay _logger.info("Liblinear Epsilon for iter = " + String.valueOf(i) + " is: " + String.valueOf(liblinearEpsilon)); _logger.info("aggressiveLiblinearEpsilonDecay=" + aggressiveLiblinearEpsilonDecay); AvroOutputFormat.setOutputPath(conf, new Path(outpath)); AvroUtils.addAvroCacheFiles(conf, new Path(uPath)); AvroUtils.addAvroCacheFiles(conf, new Path(zPath)); AvroUtils.addAvroCacheFiles(conf, new Path(rhoPath)); conf.setNumReduceTasks(nblocks * lambdastr.size()); AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$); AvroUtils.runAvroJob(conf); // Load the result from the last iteration // compute z and u given x Map<String, LinearModel> xbar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaRho.size(), true); Map<String, LinearModel> ubar = LinearModelUtils.meanModel(conf, uPath, nblocks, lambdaRho.size(), false); Map<String, LinearModel> lastz = new HashMap<String, LinearModel>(); for (String k : z.keySet()) { lastz.put(k, z.get(k).copy()); } for (String lambda : xbar.keySet()) { LinearModel thisz = z.get(lambda); thisz.clear(); float l = Float.parseFloat(lambda); float r = lambdaRho.get(l); double weight; //L2 regularization if (reg == 2) { _logger.info("Running code for regularizer = " + String.valueOf(reg)); weight = nblocks * r / (l + nblocks * r); Map<String, Double> weightmap = new HashMap<String, Double>(); for (String k : lambdaMap.keySet()) { weightmap.put(k, nblocks * r / (lambdaMap.get(k) + nblocks * r + 0.0)); } thisz.linearCombine(1.0, weight, xbar.get(lambda), weightmap); if (!ubar.isEmpty()) { thisz.linearCombine(1.0, weight, ubar.get(lambda), weightmap); } if (!penalizeIntercept) { if (ubar.isEmpty()) { thisz.setIntercept(xbar.get(lambda).getIntercept()); } else { thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept()); } } z.put(lambda, thisz); } else { // L1 regularization _logger.info("Running code for regularizer = " + String.valueOf(reg)); weight = l / (r * nblocks + 0.0); Map<String, Double> weightmap = new HashMap<String, Double>(); for (String k : lambdaMap.keySet()) { weightmap.put(k, lambdaMap.get(k) / (r * nblocks + 0.0)); } // LinearModel thisz = new LinearModel(); thisz.linearCombine(1.0, 1.0, xbar.get(lambda)); if (!ubar.isEmpty()) { thisz.linearCombine(1.0, 1.0, ubar.get(lambda)); } // Iterative Thresholding Map<String, Double> thisCoefficients = thisz.getCoefficients(); for (String k : thisCoefficients.keySet()) { double val = thisCoefficients.get(k); if (val > weight) { thisCoefficients.put(k, val - weight); } else if (val < -weight) { thisCoefficients.put(k, val + weight); } } thisz.setCoefficients(thisCoefficients); if (!penalizeIntercept) { if (ubar.isEmpty()) { thisz.setIntercept(xbar.get(lambda).getIntercept()); } else { thisz.setIntercept(xbar.get(lambda).getIntercept() + ubar.get(lambda).getIntercept()); } } z.put(lambda, thisz); } } xbar.clear(); ubar.clear(); // Output max difference between last z and this z mindiff = 99999999; double maxdiff = 0; for (String k : z.keySet()) { LinearModel tmp = lastz.get(k); if (tmp == null) tmp = new LinearModel(); tmp.linearCombine(1, -1, z.get(k)); double diff = tmp.maxAbsValue(); _logger.info( "For lambda=" + k + ": Max Difference between last z and this z = " + String.valueOf(diff)); tmp.clear(); if (mindiff > diff) mindiff = diff; if (maxdiff < diff) maxdiff = diff; } double epsilon = props.getDouble(EPSILON, 0.0001); // remove tmp files? if (props.getBoolean(REMOVE_TMP_DIR, false) && i >= 2) { FileSystem fs = FileSystem.get(conf); fs.delete(new Path(outBasePath + "/iter-" + String.valueOf(i - 1)), true); } // Output testloglik and update best model if (testLoglikPerIter) { updateLogLikBestModel(conf, i, z, testPath, ignoreValue, bestTestLoglik, outBasePath, numClickReplicates); } if (maxdiff < epsilon && liblinearEpsilon <= 0.00001) { break; } } // write z into file String zPath = outBasePath + "/final-model/part-r-00000.avro"; LinearModelUtils.writeLinearModel(conf, zPath, z); // remove tmp files? if (props.getBoolean(REMOVE_TMP_DIR, false)) { FileSystem fs = FileSystem.get(conf); Path initalModelPath = new Path(outBasePath + "/initialModel"); if (fs.exists(initalModelPath)) { fs.delete(initalModelPath, true); } for (int j = i - 2; j <= i; j++) { Path deletepath = new Path(outBasePath + "/iter-" + String.valueOf(j)); if (fs.exists(deletepath)) { fs.delete(deletepath, true); } } fs.delete(new Path(outBasePath + "/tmp-data"), true); } }
From source file:com.linkedin.mlease.regression.jobs.RegressionNaiveTrain.java
License:Open Source License
@Override public void run() throws Exception { JobConfig props = super.getJobConfig(); String outBasePath = props.getString(OUTPUT_BASE_PATH); boolean heavyPerItemTrain = props.getBoolean(HEAVY_PER_ITEM_TRAIN, false); String partitionIdPath = ""; if (heavyPerItemTrain) { partitionIdPath = outBasePath + "/partitionIds"; props.put(AbstractAvroJob.OUTPUT_PATH, partitionIdPath); JobConf conf = createJobConf(PartitionIdAssignerMapper.class, PartitionIdAssignerReducer.class, PartitionIdAssignerCombiner.class, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)), Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT))); conf.set(LAMBDA, props.getString(LAMBDA)); AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$); conf.setNumReduceTasks(1);/*w w w.j a va 2 s .c om*/ AvroUtils.runAvroJob(conf); } _logger.info("Start training per-key naive logistic regression model..."); String outpath = outBasePath + "/models"; props.put(AbstractAvroJob.OUTPUT_PATH, outpath); JobConf conf = createJobConf(NaiveMapper.class, NaiveReducer.class, Pair.getPairSchema(Schema.create(Type.STRING), RegressionPrepareOutput.SCHEMA$), LinearModelAvro.SCHEMA$); // set up conf boolean computeModelMean = props.getBoolean(COMPUTE_MODEL_MEAN, true); int nblocks = -1; if (computeModelMean) { nblocks = props.getInt(NUM_BLOCKS); conf.setInt(NUM_BLOCKS, nblocks); } List<String> lambdastr = props.getStringList(LAMBDA, ","); conf.set(LAMBDA, props.getString(LAMBDA)); conf.setFloat(PRIOR_MEAN, props.getFloat(PRIOR_MEAN, 0.0)); conf.setBoolean(PENALIZE_INTERCEPT, props.getBoolean(PENALIZE_INTERCEPT, false)); conf.setBoolean(HAS_INTERCEPT, props.getBoolean(HAS_INTERCEPT, true)); conf.set(INTERCEPT_KEY, props.getString(INTERCEPT_KEY, LIBLINEAR_INTERCEPT_KEY)); conf.setLong(REPORT_FREQUENCY, props.getLong(REPORT_FREQUENCY, 1000000)); boolean removeTmpDir = props.getBoolean(REMOVE_TMP_DIR, true); conf.setFloat(LIBLINEAR_EPSILON, props.getFloat(LIBLINEAR_EPSILON, 0.001f)); String lambdaMap = props.getString(LAMBDA_MAP, ""); conf.set(LAMBDA_MAP, lambdaMap); if (!lambdaMap.equals("")) { AvroUtils.addAvroCacheFiles(conf, new Path(lambdaMap)); } conf.setBoolean(BINARY_FEATURE, props.getBoolean(BINARY_FEATURE, false)); conf.setBoolean(SHORT_FEATURE_INDEX, props.getBoolean(SHORT_FEATURE_INDEX, false)); // set up lambda Set<Float> lambdaSet = new HashSet<Float>(); for (String l : lambdastr) { lambdaSet.add(Float.parseFloat(l)); } conf.setInt(DATA_SIZE_THRESHOLD, props.getInt(DATA_SIZE_THRESHOLD, 0)); // set up partition id if (heavyPerItemTrain && !partitionIdPath.equals("")) { conf.set(PARTITION_ID_PATH, partitionIdPath); AvroHdfsFileReader reader = new AvroHdfsFileReader(conf); ReadPartitionIdAssignmentConsumer consumer = new ReadPartitionIdAssignmentConsumer(); reader.build(partitionIdPath, consumer); Map<String, Integer> partitionIdMap = consumer.get(); int maxPartitionId = 0; for (int v : partitionIdMap.values()) { if (v > maxPartitionId) { maxPartitionId = v; } } AvroUtils.addAvroCacheFiles(conf, new Path(partitionIdPath)); conf.setNumReduceTasks(maxPartitionId + 1); conf.setPartitionerClass(NaivePartitioner.class); } // run job AvroJob.setInputSchema(conf, RegressionPrepareOutput.SCHEMA$); AvroUtils.runAvroJob(conf); // Compute Mean if (computeModelMean) { Map<String, LinearModel> betabar = LinearModelUtils.meanModel(conf, outpath, nblocks, lambdaSet.size(), true); // Output the mean for each lambda // write z into file String finalOutPath = outBasePath + "/final-model/part-r-00000.avro"; LinearModelUtils.writeLinearModel(conf, finalOutPath, betabar); } // remove tmp dir if (removeTmpDir) { FileSystem fs = FileSystem.get(conf); fs.delete(new Path(outBasePath + "/tmp-data"), true); } }
From source file:com.m6d.filecrush.crush.CrushOptionParsingTest.java
License:Apache License
@Before public void before() throws IOException { crush = new Crush(); JobConf job = new JobConf(false); crush.setConf(job);//from w ww. ja va 2s . co m job.set("fs.default.name", "file:///"); job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); job.setInt("mapreduce.job.reduces", 20); job.setLong("dfs.blocksize", 1024 * 1024 * 64); FileSystem fs = FileSystem.get(job); fs.setWorkingDirectory(new Path(tmp.getRoot().getAbsolutePath())); crush.setFileSystem(fs); }
From source file:com.m6d.filecrush.crush.CrushReducerTest.java
License:Apache License
@Test public void missingInputRegex() { JobConf job = new JobConf(false); job.set("mapred.tip.id", "task_201011081200_14527_r_1234"); job.set("fs.default.name", "file:///"); job.set("fs.file.impl", "org.apache.hadoop.fs.LocalFileSystem"); job.set("mapred.output.dir", outDir.getAbsolutePath()); job.setLong("crush.timestamp", 98765); job.setLong("dfs.blocksize", 1024 * 1024 * 64L); job.setInt("crush.num.specs", 2); job.set("crush.0.regex", "foo"); job.set("crush.0.regex.replacement", "bar"); job.set("crush.0.input.format", SequenceFileInputFormat.class.getName()); job.set("crush.0.output.format", TextOutputFormat.class.getName()); job.set("crush.1.regex.replacement", "bar"); job.set("crush.1.input.format", SequenceFileInputFormat.class.getName()); job.set("crush.1.output.format", TextOutputFormat.class.getName()); reducer = new CrushReducer(); try {/*w w w . j a v a2 s. com*/ reducer.configure(job); fail(); } catch (IllegalArgumentException e) { if (!"No input regex: crush.1.regex".equals(e.getMessage())) { throw e; } } }