List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputValueClass
public void setMapOutputValueClass(Class<?> theClass)
From source file:org.apache.sysml.runtime.transform.ApplyTfBBMR.java
License:Apache License
public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter, int replication, String headerLine) throws Exception { CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst); long[] rlens = new long[] { numRows }; long[] clens = new long[] { numColsAfter }; int[] brlens = new int[] { rblk.brlen }; int[] bclens = new int[] { rblk.bclen }; byte[] realIndexes = new byte[] { rblk.input }; byte[] resultIndexes = new byte[] { rblk.output }; JobConf job = new JobConf(ApplyTfBBMR.class); job.setJobName("ApplyTfBB"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfBBMR.class); // set relevant classes job.setMapperClass(ApplyTfBBMapper.class); MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL); MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens); MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens); MRJobConfiguration.setCSVReblockInstructions(job, rblkInst); //set up the instructions that will happen in the reducer, after the aggregation instrucions MRJobConfiguration.setInstructionsInReducer(job, otherInst); job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); //set up preferred custom serialization framework for binary block format if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) MRJobConfiguration.addBinaryBlockSerializationFramework(job); //set up what matrices are needed to pass from the mapper to reducer HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, rblkInst, null, otherInst, resultIndexes); MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, 
null, rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false); //set up the number of reducers int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(), ret.numReducerGroups);// w w w . j a va2s. c o m job.setNumReduceTasks(numRed); //set up the multiple output files, and their format information MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 }, new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false); // configure mapper and the mapper output key value pairs job.setMapperClass(ApplyTfBBMapper.class); job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class); job.setMapOutputValueClass(BlockRow.class); //configure reducer job.setReducerClass(CSVReblockReducer.class); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); //set unique working dir MRJobConfiguration.setUniqueWorkingDir(job); // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(new Path(partOffsetsFile), "part-00000"); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); // Adding "dummy" string to handle the case of na_strings = "" if (inputDataProperties.getNAStrings() != null) job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC, spec); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); 
job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); RunningJob runjob = JobClient.runJob(job); MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job); Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS); for (int i = 0; i < resultIndexes.length; i++) { ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i))); } return new JobReturn(ret.stats, runjob.isSuccessful()); }
From source file:org.apache.sysml.runtime.transform.ApplyTfCSVMR.java
License:Apache License
public static JobReturn runJob(String inputPath, String spec, String mapsPath, String tmpPath, String outputPath, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(ApplyTfCSVMR.class); job.setJobName("ApplyTfCSV"); /* Setup MapReduce Job */ job.setJarByClass(ApplyTfCSVMR.class); // set relevant classes job.setMapperClass(ApplyTfCSVMapper.class); job.setNumReduceTasks(0);/*from w ww . j a v a2 s. c o m*/ // Add transformation metadata file as well as partOffsetsFile to Distributed cache DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job); DistributedCache.createSymlink(job); Path cachefile = new Path(partOffsetsFile); DistributedCache.addCacheFile(cachefile.toUri(), job); DistributedCache.createSymlink(job); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(TextOutputFormat.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(Text.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(Text.class); job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. 
Path outPath = new Path(outputPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true); FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC, spec); job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath)); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString()); job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); // Since transform CSV produces part files w/ prefix transform-part-*, // delete all the "default" part-..... files deletePartFiles(fs, outPath); MatrixCharacteristics mc = new MatrixCharacteristics(); return new JobReturn(new MatrixCharacteristics[] { mc }, runjob.isSuccessful()); }
From source file:org.apache.sysml.runtime.transform.GenTfMtdMR.java
License:Apache License
public static long runJob(String inputPath, String txMtdPath, String specWithIDs, String smallestFile, String partOffsetsFile, CSVFileFormatProperties inputDataProperties, long numCols, int replication, String headerLine) throws IOException, ClassNotFoundException, InterruptedException { JobConf job = new JobConf(GenTfMtdMR.class); job.setJobName("GenTfMTD"); /* Setup MapReduce Job */ job.setJarByClass(GenTfMtdMR.class); // set relevant classes job.setMapperClass(GTFMTDMapper.class); job.setReducerClass(GTFMTDReducer.class); // set input and output properties job.setInputFormat(TextInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(IntWritable.class); job.setMapOutputValueClass(DistinctValue.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(LongWritable.class); job.setInt(MRConfigurationNames.DFS_REPLICATION, replication); FileInputFormat.addInputPath(job, new Path(inputPath)); // delete outputPath, if exists already. Path outPath = new Path(txMtdPath); FileSystem fs = FileSystem.get(job); fs.delete(outPath, true);//from ww w . ja v a 2 s. 
co m FileOutputFormat.setOutputPath(job, outPath); job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader())); job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim()); if (inputDataProperties.getNAStrings() != null) // Adding "dummy" string to handle the case of na_strings = "" job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings())); job.set(MRJobConfiguration.TF_SPEC, specWithIDs); job.set(MRJobConfiguration.TF_SMALLEST_FILE, smallestFile); job.setLong(MRJobConfiguration.TF_NUM_COLS, numCols); job.set(MRJobConfiguration.TF_HEADER, headerLine); job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, txMtdPath); // offsets file to store part-file names and offsets for each input split job.set(MRJobConfiguration.TF_OFFSETS_FILE, partOffsetsFile); //turn off adaptivemr job.setBoolean("adaptivemr.map.enable", false); // Run the job RunningJob runjob = JobClient.runJob(job); Counters c = runjob.getCounters(); long tx_numRows = c.findCounter(MRJobConfiguration.DataTransformCounters.TRANSFORMED_NUM_ROWS).getCounter(); return tx_numRows; }
From source file:org.archive.jbs.misc.PageRank.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { System.err.println("PageRank <output> <input>..."); return 1; }/*from www .j a v a 2 s . co m*/ JobConf conf = new JobConf(getConf(), PageRank.class); conf.setJobName("jbs.PageRank"); // No need to set this since we use the MultipleInputs class // below, which allows us to specify a mapper for each input. // conf.setMapperClass(Map.class); conf.setReducerClass(Reduce.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(GenericObject.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setOutputFormat(SequenceFileOutputFormat.class); // The input paths should be either NutchWAX segment directories // or Hadoop SequenceFiles containing JSON-encoded Documents for (int i = 1; i < args.length; i++) { Path p = new Path(args[i]); // Expand any file globs and then check each matching path FileStatus[] files = FileSystem.get(conf).globStatus(p); for (FileStatus file : files) { if (file.isDir()) { // If it's a directory, then check if it is a Nutch segment, otherwise treat as a SequenceFile. Path nwp = new Path(file.getPath(), "parse_data"); if (p.getFileSystem(conf).exists(nwp)) { LOG.info("Adding input path: " + nwp); MultipleInputs.addInputPath(conf, nwp, SequenceFileInputFormat.class, Map.class); } else { LOG.info("Adding input path: " + file.getPath()); MultipleInputs.addInputPath(conf, file.getPath(), SequenceFileInputFormat.class, Map.class); } } else { // Not a directory, skip it. LOG.warn("Not a directory, skip input: " + file.getPath()); } } } FileOutputFormat.setOutputPath(conf, new Path(args[0])); RunningJob rj = JobClient.runJob(conf); return rj.isSuccessful() ? 0 : 1; }
From source file:org.archive.nutchwax.IndexerMapReduce.java
License:Apache License
/**
 * Registers the parse-data and parse-text directories of every segment as job input
 * and sets the job's formats, mapper/reducer class and key/value types.
 *
 * @param segments segment directories to add as input
 * @param job      job configuration to populate
 */
public static void initMRJob(Collection<Path> segments, JobConf job) {
    for (final Path seg : segments) {
        LOG.info("IndexerMapReduces: adding segment: " + seg);

        final Path parseData = new Path(seg, ParseData.DIR_NAME);
        final Path parseText = new Path(seg, ParseText.DIR_NAME);
        FileInputFormat.addInputPath(job, parseData);
        FileInputFormat.addInputPath(job, parseText);
    }

    // Input side.
    job.setInputFormat(SequenceFileInputFormat.class);

    // The same class serves as both mapper and reducer.
    job.setMapperClass(IndexerMapReduce.class);
    job.setReducerClass(IndexerMapReduce.class);

    // Output side.
    job.setOutputFormat(IndexerOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setMapOutputValueClass(NutchWritable.class);
    job.setOutputValueClass(NutchWritable.class);
}
From source file:org.cloudata.core.tablet.backup.BackupBinaryJob.java
License:Apache License
public void runBackUp(String tableName, String outputPath) throws IOException { CloudataConf nconf = new CloudataConf(); CloudataFileSystem fs = CloudataFileSystem.get(nconf); if (fs.exists(new GPath(outputPath))) { throw new IOException("Output path already exists:" + outputPath); }/*from ww w. j a v a2s .com*/ if (!CTable.existsTable(nconf, tableName)) { throw new IOException("No Table:" + tableName); } CTable ctable = CTable.openTable(nconf, tableName); String columns = ""; for (String eachColumn : ctable.getTableSchema().getColumnsArray()) { columns += eachColumn + ","; } columns = columns.substring(0, columns.length() - 1); JobConf jobConf = new JobConf(BackupBinaryJob.class); jobConf.setMapperClass(BackupBinaryMap.class); jobConf.setInputFormat(BackupTabletInputFormat.class); jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, tableName); jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, columns); FileOutputFormat.setOutputPath(jobConf, new Path(outputPath)); jobConf.setMapOutputKeyClass(BytesWritable.class); jobConf.setMapOutputValueClass(BytesWritable.class); jobConf.setOutputFormat(SequenceFileOutputFormat.class); //map only jobConf.setNumReduceTasks(0); JobClient.runJob(jobConf); }
From source file:org.cloudata.examples.first.FirstMapReduce.java
License:Apache License
public static void main(String[] args) throws Exception { //Output ? ?//from www . ja v a 2s.c o m CloudataConf conf = new CloudataConf(); String outputTableName = "InvertedTable"; TableSchema outputTableSchema = new TableSchema(); outputTableSchema.addColumn("InvertedCloumn"); if (!CTable.existsTable(conf, outputTableName)) { CTable.createTable(conf, outputTableSchema); } JobConf jobConf = new JobConf(FirstMapReduce.class); jobConf.setJobName("FirstMapReduce"); String libDir = CloudataMapReduceUtil.initMapReduce(jobConf); //<Mapper> //Mapper ? jobConf.setMapperClass(FirstMapReduceMapper.class); //InputFormat? TabletInputFormat jobConf.setInputFormat(FirstMapReduceInputFormat.class); jobConf.setMapOutputKeyClass(Text.class); jobConf.setMapOutputValueClass(Text.class); //</Mapper> //<Reducer> String outputPath = "temp/FirstMapReduce"; FileOutputFormat.setOutputPath(jobConf, new Path(outputPath)); //Reducer ? jobConf.setReducerClass(FirstMapReduceReducer.class); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); //Map Reduce ?? . CTable ctable = CTable.openTable(conf, "SampleTable1"); TabletInfo[] tabletInfos = ctable.listTabletInfos(); jobConf.setNumReduceTasks(tabletInfos.length); //Reduce? Tablet? ?? ? ? //Task ?? ? 0 . jobConf.setMaxReduceAttempts(0); //</Reducer> try { //Job JobClient.runJob(jobConf); } finally { //Temp FileSystem fs = FileSystem.get(jobConf); fs.delete(new Path(outputPath), true); CloudataMapReduceUtil.clearMapReduce(libDir); } }
From source file:org.cloudata.examples.upload.partitionjob.PartitionJob.java
License:Apache License
public boolean runJob(String inputPath, String tableName, int numOfTablets) throws IOException { JobConf jobConf = new JobConf(PartitionJob.class); String libDir = CloudataMapReduceUtil.initMapReduce(jobConf); FileSystem fs = FileSystem.get(jobConf); // ? /* w w w. j a v a 2 s. com*/ FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true); jobConf.setJobName("PartitionJob_" + tableName + "(" + new Date() + ")"); jobConf.set("cloudata.numOfTablets", String.valueOf(numOfTablets)); jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName); String clientOpt = jobConf.get("mapred.child.java.opts"); if (clientOpt == null) { clientOpt = ""; } jobConf.set("mapred.child.java.opts", clientOpt + " -Duser.name=" + System.getProperty("user.name")); //<Map> FileInputFormat.addInputPath(jobConf, new Path(inputPath)); jobConf.setInputFormat(TextInputFormat.class); jobConf.setMapperClass(PartitionMap.class); jobConf.setMapOutputKeyClass(Text.class); jobConf.setMapOutputValueClass(Text.class); //</Map> //<Reduce> Path tempOutputPath = new Path("temp/partitionJob/" + tableName + "/reducer"); FileOutputFormat.setOutputPath(jobConf, tempOutputPath); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); jobConf.setReducerClass(PartitionReducer.class); //Reduce 1 jobConf.setNumReduceTasks(1); //</Reduce> try { RunningJob job = JobClient.runJob(jobConf); return job.isSuccessful(); } finally { FileUtil.delete(fs, new Path(getLogCountFilepath(tableName)), true); FileUtil.delete(fs, tempOutputPath, true); CloudataMapReduceUtil.clearMapReduce(libDir); } }
From source file:org.cloudata.examples.upload.partitionjob.UploadJob.java
License:Apache License
public void runJob(String inputPath, String tableName) throws IOException { JobConf jobConf = new JobConf(UploadJob.class); String libDir = CloudataMapReduceUtil.initMapReduce(jobConf); jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")"); //KeyRangePartitioner //AbstractTabletInputFormat.OUTPUT_TABLE? ? jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName); CloudataConf conf = new CloudataConf(); CTable ctable = CTable.openTable(conf, tableName); TabletInfo[] tabletInfos = ctable.listTabletInfos(); //<Map> FileInputFormat.addInputPath(jobConf, new Path(inputPath)); jobConf.setInputFormat(TextInputFormat.class); jobConf.setMapperClass(UploadMap.class); jobConf.setMapOutputKeyClass(Text.class); jobConf.setMapOutputValueClass(Text.class); jobConf.setMapSpeculativeExecution(false); jobConf.setMaxMapAttempts(0);// w w w .ja v a2 s . com jobConf.setPartitionerClass(KeyRangePartitioner.class); //</Map> //<Reduce> Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer"); FileOutputFormat.setOutputPath(jobConf, tempOutputPath); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); jobConf.setReducerClass(UploadReducer.class); jobConf.setReduceSpeculativeExecution(false); jobConf.setMaxReduceAttempts(0); //Reduce Tablet jobConf.setNumReduceTasks(tabletInfos.length); //</Reduce> try { JobClient.runJob(jobConf); } finally { FileSystem fs = FileSystem.get(jobConf); FileUtil.delete(fs, tempOutputPath, true); CloudataMapReduceUtil.clearMapReduce(libDir); } }
From source file:org.cloudata.examples.upload.SimpleUploaderMapReduce.java
License:Apache License
public void run(String[] args) throws IOException { if (args.length < 3) { System.out.println("Usage: java SimpleUploaderMapReduce <input path> <table name> <# reduce>"); System.exit(0);//from w ww .j a va 2 s . com } Path inputPath = new Path(args[0]); String tableName = args[1]; CloudataConf nconf = new CloudataConf(); if (!CTable.existsTable(nconf, tableName)) { TableSchema tableSchema = new TableSchema(tableName); tableSchema.addColumn("Col1"); Row.Key[] rowKeys = new Row.Key[20]; for (int i = 0; i < 10; i++) { rowKeys[i] = new Row.Key("-0" + i); } for (int i = 1; i < 10; i++) { rowKeys[9 + i] = new Row.Key("0" + i); } rowKeys[19] = Row.Key.MAX_KEY; CTable.createTable(nconf, tableSchema, rowKeys); } JobConf jobConf = new JobConf(HdfsToCloudataMapReduce.class); String libDir = CloudataMapReduceUtil.initMapReduce(jobConf); // <MAP> FileInputFormat.addInputPath(jobConf, inputPath); jobConf.setInputFormat(TextInputFormat.class); jobConf.setMapperClass(SimpleUploaderMapper.class); jobConf.setPartitionerClass(KeyRangePartitioner.class); jobConf.setMapOutputKeyClass(Text.class); jobConf.setMapOutputValueClass(Text.class); jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName); // </MAP> // <REDUCE> FileOutputFormat.setOutputPath(jobConf, new Path("SimpleUploaderMapReduce_" + System.currentTimeMillis())); jobConf.setReducerClass(SimpleUploaderReducer.class); jobConf.setNumReduceTasks(Integer.parseInt(args[2])); jobConf.setMaxReduceAttempts(0); // </REDUCE> try { JobClient.runJob(jobConf); } catch (Exception e) { e.printStackTrace(); } finally { FileSystem fs = FileSystem.get(jobConf); fs.delete(FileOutputFormat.getOutputPath(jobConf), true); CloudataMapReduceUtil.clearMapReduce(libDir); } }