List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
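The signature above takes a single loadDefaults flag; the examples below mostly use the JobConf(Configuration) and JobConf(Class) overloads, so here is a minimal sketch of the flag on its own. The property name and value are placeholders for illustration only, not taken from any example.

    import org.apache.hadoop.mapred.JobConf;

    public class JobConfLoadDefaultsSketch {
        public static void main(String[] args) {
            // true: load the default Hadoop resources (core-default.xml, core-site.xml, ...);
            // false: start from an empty configuration and rely only on explicitly set values.
            JobConf conf = new JobConf(false);

            // Hypothetical property, used only to show that explicitly set values still apply.
            conf.set("example.job.name", "sketch");
            System.out.println(conf.get("example.job.name"));
        }
    }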
From source file:com.cloudera.knittingboar.sgd.TestRunRCV1Subset.java
License:Apache License
public void testSplits() throws IOException {

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(inputDir, job);

    System.out.println("split count: " + splits.length);
    assertEquals(10, splits.length);

    InputSplit[] splits_full = generateDebugSplits(fullRCV1Dir, job);
    System.out.println("full rcv1 split count: " + splits_full.length);

    Text value = new Text();

    for (int x = 0; x < splits_full.length; x++) {
        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits_full[x]);
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString());
        custom_reader_0.next(value);
        System.out.println(x + " > " + value.toString() + "\n");
    }
}
From source file:com.cloudera.knittingboar.sgd.TestRunRCV1Subset.java
License:Apache License
public void testRunRCV1Subset() throws IOException, Exception {

    int num_passes = 15;

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(fullRCV1Dir, job);
    System.out.println("split count: " + splits.length);

    ArrayList<POLRWorkerDriver> workers = new ArrayList<POLRWorkerDriver>();

    for (int x = 0; x < splits.length; x++) {

        POLRWorkerDriver worker_model_builder = new POLRWorkerDriver(); //workers.get(x);
        worker_model_builder.internalID = String.valueOf(x);
        // simulates the conf stuff
        worker_model_builder.setConf(this.generateDebugConfigurationObject());

        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[x]);

        // TODO: set this up to run through the conf pathways
        worker_model_builder.setupInputSplit(custom_reader_0);
        worker_model_builder.LoadConfigVarsLocally();
        worker_model_builder.Setup();

        workers.add(worker_model_builder);

        System.out.println("> Setup Worker " + x);
    }

    for (int x = 0; x < num_passes; x++) {

        for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

            workers.get(worker_id).RunNextTrainingBatch();

            GradientUpdateMessage msg0 = workers.get(worker_id).GenerateUpdateMessage();

            master.AddIncomingGradientMessageToQueue(msg0);
            master.RecvGradientMessage(); // process msg
        }

        if (x < num_passes - 1) {

            master.GenerateGlobalUpdateVector();

            GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();

            // process global updates
            for (int worker_id = 0; worker_id < workers.size(); worker_id++) {
                workers.get(worker_id).ProcessIncomingParameterVectorMessage(returned_msg);
            }

            System.out.println("---------- cycle " + x + " done ------------- ");

        } else {

            System.out.println("---------- cycle " + x + " done ------------- ");
            System.out.println("> Saving Model...");

            master.SaveModelLocally("/tmp/master_sgd.model");

        } // if

    } // for

    workers.get(0).Debug();
}
From source file:com.cloudera.knittingboar.sgd.TestWorkerAndMaster.java
License:Apache License
/**
 * 1. Setup Worker
 *
 * 2. Generate some gradient
 *
 * 3. Construct a gradient update message
 *
 * 4. simulate generation of a PVec update message
 *
 * 5. update the local POLR driver w the PVec message
 *
 * 6. check the local gradient and pvec matrices
 * @throws Exception
 */
public void testBasicMessageFlowBetweenMasterAndWorker() throws Exception {

    // 1. Setup Worker ---------------------------------------------

    System.out.println("\n------ testBasicMessageFlowBetweenMasterAndWorker --------- ");

    POLRMasterDriver master = new POLRMasterDriver();
    //master.LoadConfig();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    POLRWorkerDriver worker_model_builder = new POLRWorkerDriver();

    // generate the debug conf ---- normally setup by YARN stuff
    worker_model_builder.setConf(this.generateDebugConfigurationObject());

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(workDir, job);

    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder.setupInputSplit(custom_reader);
    worker_model_builder.LoadConfigVarsLocally();
    worker_model_builder.Setup();

    for (int x = 0; x < 25; x++) {
        worker_model_builder.RunNextTrainingBatch();
        System.out.println("---------- cycle " + x + " done ------------- ");
    } // for

    worker_model_builder.polr.Debug_PrintGamma();

    // 3. generate a gradient update message ---------------------------------------------
    GradientUpdateMessage msg = worker_model_builder.GenerateUpdateMessage();
    //msg.gradient.Debug();

    master.AddIncomingGradientMessageToQueue(msg);
    master.RecvGradientMessage(); // process msg

    // 5. pass global pvector update message back to worker process, update driver pvector
    master.GenerateGlobalUpdateVector();

    GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
    //returned_msg.parameter_vector.set(0, 0, -1.0);
    //Utils.PrintVector(returned_msg.parameter_vector.viewRow(0));

    worker_model_builder.ProcessIncomingParameterVectorMessage(returned_msg);
    //returned_msg.parameter_vector.set(0, 0, -1.0);

    System.out.println("---------- ");
    Utils.PrintVector(returned_msg.parameter_vector.viewRow(0));

    // System.out.println( "Master Param Vector: " + returned_msg.parameter_vector.get(0, 0) + ", " + returned_msg.parameter_vector.get(0, 1) );
    // assertEquals( -1.0, returned_msg.parameter_vector.get(0, 0) );
    // assertEquals( 1.0, returned_msg.parameter_vector.get(0, 1) );

    //worker.ProcessIncomingParameterVectorMessage(returned_msg);

    worker_model_builder.polr.Debug_PrintGamma();
}
From source file:com.cloudera.knittingboar.sgd.TestWorkerAndMaster.java
License:Apache License
/**
 * Runs 10 passes of 2 subsets of the donut data
 * - between each pass, the parameter vector is updated
 * - at the end, we compare to OLR
 * @throws Exception
 */
public void testOLRvs10PassesOfPOLR() throws Exception {

    // config ------------
    System.out.println("\n------ testOLRvs10PassesOfPOLR --------- ");

    POLRWorkerDriver olr_run = new POLRWorkerDriver();

    // generate the debug conf ---- normally setup by YARN stuff
    olr_run.setConf(this.generateDebugConfigurationObject());

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(workDir, job);

    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);

    // TODO: set this up to run through the conf pathways
    olr_run.setupInputSplit(custom_reader);
    olr_run.LoadConfigVarsLocally();
    olr_run.Setup();

    for (int x = 0; x < 25; x++) {
        olr_run.RunNextTrainingBatch();
        System.out.println("---------- cycle " + x + " done ------------- ");
    } // for

    /*
     * ----------------------- now run the parallel version -----------------
     */

    /*
    System.out.println( "\n\n------- POLR: Start ---------------" );

    POLRMasterDriver master = new POLRMasterDriver();
    //master.LoadConfig();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.debug_setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println( "Conf load fail: shutting down." );
        assertEquals( 0, 1 );
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    //LogisticModelBuilder model_builder = new LogisticModelBuilder();
    POLRWorkerDriver worker = new POLRWorkerDriver();

    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    worker.debug_setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        worker.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println( "Conf load fail: shutting down." );
        assertEquals( 0, 1 );
    }
    // now construct any needed machine learning data structures based on config
    worker.Setup();
    worker.DebugPrintConfig();
    // ------------------

    // 2. Train a batch, generate some gradient ---------------------------------------------
    for (int pass = 0; pass < passes; pass++) {

        // --- run a pass -------------------------
        BufferedReader in = open(inputFile);
        String line = in.readLine();
        line = in.readLine(); // skip first line IF this is a CSV
        while (line != null) {
            worker.IncrementallyTrainModelWithRecord( line );
            line = in.readLine();
        } // while
        in.close();
        // ------- end of the pass -------------------

        // -------------- simulate message passing --------------
        GradientUpdateMessage msg0 = worker.GenerateUpdateMessage();
        master.AddIncomingGradientMessageToQueue(msg0);
        master.RecvGradientMessage(); // process msg

        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker.ProcessIncomingParameterVectorMessage(returned_msg);

        System.out.println( "POLR: Updating Worker Parameter Vector" );

    } // for

    System.out.println( "POLR: Debug Beta / Gamma" );
    worker.polr.Debug_PrintGamma();

    // 3. generate a gradient update message ---------------------------------------------
    */
}
From source file:com.cloudera.recordservice.avro.mapred.ColorCount.java
License:Apache License
/**
 * Run the MR1 color count with generic records, and return a map of favorite colors to
 * the number of users.
 */
public static java.util.Map<String, Integer> countColors() throws IOException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);

    JobConf conf = new JobConf(ColorCount.class);
    conf.setJobName("MR1 Color Count With Generic Records");
    conf.setInt("mapreduce.job.reduces", 1);

    conf.setBoolean(com.cloudera.recordservice.avro.AvroJob.USE_RECORD_SERVICE_INPUT_FORMAT_CONF_KEY, true);
    com.cloudera.recordservice.avro.AvroJob.setInputFormat(conf, org.apache.avro.mapred.AvroInputFormat.class);

    RecordServiceConfig.setInputTable(conf, "rs", "users");
    FileOutputFormat.setOutputPath(conf, outputPath);

    AvroJob.setMapperClass(conf, Map.class);
    AvroJob.setReducerClass(conf, Reduce.class);
    AvroJob.setOutputSchema(conf,
        Pair.getPairSchema(Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)));

    JobClient.runJob(conf);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}
From source file:com.cloudera.recordservice.avro.mapreduce.ColorCount.java
License:Apache License
/**
 * Run the MR2 color count with generic records, and return a map of favorite colors to
 * the number of users.
 */
public static java.util.Map<String, Integer> countColors()
        throws IOException, ClassNotFoundException, InterruptedException {
    String output = TestUtil.getTempDirectory();
    Path outputPath = new Path(output);
    JobConf conf = new JobConf(ColorCount.class);
    conf.setInt("mapreduce.job.reduces", 1);

    Job job = Job.getInstance(conf);
    job.setJarByClass(ColorCount.class);
    job.setJobName("MR2 Color Count With Generic Records");

    RecordServiceConfig.setInputTable(job.getConfiguration(), "rs", "users");
    job.setInputFormatClass(com.cloudera.recordservice.avro.mapreduce.AvroKeyInputFormat.class);
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    job.setReducerClass(Reduce.class);
    AvroJob.setOutputKeySchema(job, Schema.create(Schema.Type.STRING));
    AvroJob.setOutputValueSchema(job, Schema.create(Schema.Type.INT));

    job.waitForCompletion(false);

    // Read the result and return it. Since we set the number of reducers to 1,
    // there is always just one file containing the value.
    SeekableInput input = new FsInput(new Path(output + "/part-r-00000.avro"), conf);
    DatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>();
    FileReader<GenericRecord> fileReader = DataFileReader.openReader(input, reader);
    java.util.Map<String, Integer> colorMap = new HashMap<String, Integer>();
    for (GenericRecord datum : fileReader) {
        colorMap.put(datum.get(0).toString(), Integer.parseInt(datum.get(1).toString()));
    }
    return colorMap;
}
From source file:com.cloudera.recordservice.examples.mapreduce.WordCount.java
License:Apache License
public void run(String[] args) throws Exception {
    boolean useRecordService = true;
    if (args.length == 3) {
        useRecordService = Boolean.parseBoolean(args[2]);
    } else if (args.length != 2) {
        System.err.println("Usage: WordCount <input path> <output path>");
        System.exit(-1);
    }
    String input = args[0].trim();
    String output = args[1];

    JobConf conf = new JobConf(WordCount.class);
    conf.setJobName("wordcount-" + (useRecordService ? "with" : "without") + "-RecordService");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    if (useRecordService) {
        conf.setInputFormat(com.cloudera.recordservice.mapred.TextInputFormat.class);
        RecordServiceConfig.setInput(conf, input);
    } else {
        conf.setInputFormat(TextInputFormat.class);
        FileInputFormat.setInputPaths(conf, new Path(input));
    }

    FileSystem fs = FileSystem.get(conf);
    Path outputPath = new Path(output);
    if (fs.exists(outputPath)) fs.delete(outputPath, true);

    conf.setOutputFormat(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(conf, outputPath);

    JobClient.runJob(conf);
    System.out.println("Done");
}
From source file:com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License:Apache License
/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);

    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;

    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;

        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());

        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                    tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }

        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }

        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }

        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }

    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
            getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
            currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }

    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
// TODO: make this generic. This should be extensible to test all the input
// formats we support. How do we do this?
public void testReadNation() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    RecordServiceInputFormat.RecordServiceRecordReader reader =
        new RecordServiceInputFormat.RecordServiceRecordReader();

    try {
        RecordServiceConfig.setInputTable(config, null, "tpch.nation");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;
        reader.initialize(splits.get(0),
            new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));

        int numRows = 0;
        while (reader.nextKeyValue()) {
            RecordServiceRecord value = reader.getCurrentValue();
            ++numRows;

            if (numRows == 10) {
                assertEquals("INDONESIA", value.getColumnValue(1).toString());
            }
        }
        assertFalse(reader.nextKeyValue());
        assertFalse(reader.nextRecord());
        assertEquals(25, numRows);

        config.clear();
        RecordServiceConfig.setInputTable(config, "tpch", "nation", "n_comment");
        splits = PlanUtil.getSplits(config, new Credentials()).splits;
        reader.initialize(splits.get(0),
            new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));
        numRows = 0;

        while (reader.nextKeyValue()) {
            RecordServiceRecord value = reader.getCurrentValue();
            if (numRows == 12) {
                assertEquals("ously. final, express gifts cajole a", value.getColumnValue(0).toString());
            }
            ++numRows;
        }
        assertEquals(25, numRows);
    } finally {
        reader.close();
    }
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
public void testReadAllTypes() throws IOException, InterruptedException {
    Configuration config = new Configuration();
    RecordServiceInputFormat.RecordServiceRecordReader reader =
        new RecordServiceInputFormat.RecordServiceRecordReader();

    SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd");
    format.setTimeZone(TimeZone.getTimeZone("GMT"));

    try {
        RecordServiceConfig.setInputTable(config, null, "rs.alltypes");
        List<InputSplit> splits = PlanUtil.getSplits(config, new Credentials()).splits;

        int numRows = 0;
        for (InputSplit split : splits) {
            reader.initialize(split,
                new TaskAttemptContextImpl(new JobConf(config), new TaskAttemptID()));
            while (reader.nextKeyValue()) {
                RecordServiceRecord value = reader.getCurrentValue();
                if (((BooleanWritable) value.getColumnValue(0)).get()) {
                    assertEquals(0, ((ByteWritable) value.getColumnValue(1)).get());
                    assertEquals(1, ((ShortWritable) value.getColumnValue(2)).get());
                    assertEquals(2, ((IntWritable) value.getColumnValue(3)).get());
                    assertEquals(3, ((LongWritable) value.getColumnValue(4)).get());
                    assertEquals(4.0, ((FloatWritable) value.getColumnValue(5)).get(), 0.1);
                    assertEquals(5.0, ((DoubleWritable) value.getColumnValue(6)).get(), 0.1);
                    assertEquals("hello", value.getColumnValue(7).toString());
                    assertEquals("vchar1", value.getColumnValue(8).toString());
                    assertEquals("char1", value.getColumnValue(9).toString());
                    assertEquals("2015-01-01", format.format(
                        ((TimestampNanosWritable) value.getColumnValue(10)).get().toTimeStamp()));
                    assertEquals(new BigDecimal("3.1415920000"),
                        ((DecimalWritable) value.getColumnValue(11)).get().toBigDecimal());
                } else {
                    assertEquals(6, ((ByteWritable) value.getColumnValue(1)).get());
                    assertEquals(7, ((ShortWritable) value.getColumnValue(2)).get());
                    assertEquals(8, ((IntWritable) value.getColumnValue(3)).get());
                    assertEquals(9, ((LongWritable) value.getColumnValue(4)).get());
                    assertEquals(10.0, ((FloatWritable) value.getColumnValue(5)).get(), 0.1);
                    assertEquals(11.0, ((DoubleWritable) value.getColumnValue(6)).get(), 0.1);
                    assertEquals("world", value.getColumnValue(7).toString());
                    assertEquals("vchar2", value.getColumnValue(8).toString());
                    assertEquals("char2", value.getColumnValue(9).toString());
                    assertEquals("2016-01-01", format.format(
                        ((TimestampNanosWritable) value.getColumnValue(10)).get().toTimeStamp()));
                    assertEquals(new BigDecimal("1234.5678900000"),
                        ((DecimalWritable) value.getColumnValue(11)).get().toBigDecimal());
                }
                ++numRows;
            }
        }
        assertEquals(2, numRows);
    } finally {
        reader.close();
    }
}