Usage examples for org.apache.hadoop.mapred.JobConf.setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
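setMapOutputKeyClass declares the class of the keys emitted by the map tasks. It only needs to be called when the intermediate (map output) key class differs from the job's final output key class; if it is not set, the class passed to setOutputKeyClass is used for the map output as well. The sketch below is a minimal, self-contained old-API job showing where the call fits; the driver class name and the input/output paths taken from args are placeholders, not drawn from the examples that follow.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.KeyValueTextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class MapOutputKeyClassExample {        // hypothetical driver class
    public static void main(String[] args) throws IOException {
        JobConf jobConf = new JobConf(MapOutputKeyClassExample.class);
        jobConf.setJobName("setMapOutputKeyClass example");

        // Input lines are split into <Text, Text> pairs at the first tab.
        jobConf.setInputFormat(KeyValueTextInputFormat.class);
        FileInputFormat.addInputPath(jobConf, new Path(args[0]));

        // Intermediate (map output) types. Declaring them explicitly documents
        // the contract between mapper and reducer; they are mandatory whenever
        // they differ from the final output types below.
        jobConf.setMapperClass(IdentityMapper.class);
        jobConf.setMapOutputKeyClass(Text.class);
        jobConf.setMapOutputValueClass(Text.class);

        // Final (reduce output) types.
        jobConf.setReducerClass(IdentityReducer.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setOutputValueClass(Text.class);
        jobConf.setOutputFormat(TextOutputFormat.class);
        FileOutputFormat.setOutputPath(jobConf, new Path(args[1]));

        JobClient.runJob(jobConf);
    }
}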
From source file:org.cloudata.examples.web.DocFreqJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java DocFreqJob <num of repeats> docFreq [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(DocFreqJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 0) {
        maxReduce = Integer.parseInt(options[0]);
    }

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TermUploadJob.TERM_TABLE)) {
        TableSchema termTableInfo = new TableSchema(TermUploadJob.TERM_TABLE, "Test",
                TermUploadJob.TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo);
    }
    Path tempOutputPath = new Path("DocFreqJob_" + System.currentTimeMillis());

    jobConf.setJobName("DocFreqJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(DocFreqMap.class);
    jobConf.setInputFormat(AbstractTabletInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(DocFreqReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.examples.web.TermGlobalJob.java
License:Apache License
public void exec() throws Exception {
    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, GLOBAL_TABLE)) {
        TableSchema globalTableInfo = new TableSchema(GLOBAL_TABLE, "Test", GLOBAL_TABLE_COLUMNS);
        CTable.createTable(nconf, globalTableInfo);
    }
    Path tempOutputPath = new Path("globalTableInfo" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermGlobalJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermGlobalMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermGlobalReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(1);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#reduce]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        //Table
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }
        int termsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == termsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema termTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, termTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}
From source file:org.cloudata.examples.web.TermWeightJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJob <num of repeats> termWeight <outputPath> [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path(options[0]);

    JobConf jobConf = new JobConf(WebTableJob.class);
    jobConf.setJobName("TermWeightJob" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    //FileSystem fs = FileSystem.get(jobConf);
    //fs.delete(outputPath);
}
From source file:org.cloudata.examples.web.TermWeightJobOnline.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermWeightJobOnline <num of repeats> termWeightOnline [noGlobal]");
        System.exit(0);
    }
    if (options.length == 1 || !options[1].equals("noGlobal")) {
        TermGlobalJob termGlobalJob = new TermGlobalJob();
        termGlobalJob.exec();
    }
    Path outputPath = new Path("TermWeightJobOnline_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(TermWeightJobOnline.class);
    jobConf.setJobName("TermWeightJobOnline" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(TermWeightMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, WebTableJob.WEB_TABLE);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST,
            WebTableJob.WEB_TABLE_COLUMNS[1] + "," + WebTableJob.WEB_TABLE_COLUMNS[2]);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    //jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermWeightReduceOnline.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    FileOutputFormat.setOutputPath(jobConf, outputPath);
    //jobConf.setMaxReduceAttempts(0);
    JobClient jobClient = new JobClient(jobConf);
    int maxReduce = jobClient.getClusterStatus().getMaxReduceTasks();
    jobConf.setNumReduceTasks(maxReduce);
    //</REDUCE>

    //Run Job
    try {
        JobClient.runJob(jobConf);
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(outputPath, true);
    }
}
From source file:org.cloudata.examples.weblink.InLinkJob.java
License:Apache License
public static void main(String[] args) throws IOException {
    if (args.length < 2) {
        System.out.println("Usage: java InLinkJob <input table> <output table>");
        System.exit(0);
    }
    String inputTableName = args[0];
    String outputTableName = args[1];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, outputTableName)) {
        TableSchema tableSchema = new TableSchema(outputTableName);
        tableSchema.addColumn("inlink");
        CTable.createTable(nconf, tableSchema);
    }
    Path tempPath = new Path("InLinkJob_tmp" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(InLinkJob.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    //<Map>
    jobConf.setMapperClass(InLinkMap.class);
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.set(DefaultTabletInputFormat.INPUT_TABLE, inputTableName);
    jobConf.set(DefaultTabletInputFormat.INPUT_COLUMN_LIST, "outlink");
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(TextArray.class);
    //</Map>

    //<Reduce>
    FileOutputFormat.setOutputPath(jobConf, tempPath);
    jobConf.setReducerClass(InLinkReduce.class);
    jobConf.set(DefaultTabletInputFormat.OUTPUT_TABLE, outputTableName);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        CloudataFileSystem fs = CloudataFileSystem.get(nconf);
        fs.delete(new GPath(tempPath.toString()), true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.examples.weblink.ScanJob.java
License:Apache License
public void run(String[] args) throws IOException {
    if (args.length < 1) {
        System.out.println("Usage: java ScanJob <table name>");
        System.exit(0);
    }
    String tableName = args[0];

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, tableName)) {
        System.out.println("No table: " + tableName);
        System.exit(0);
    }

    JobConf jobConf = new JobConf(UploadJob.class);
    jobConf.setJobName("CloudataExamles.weblink.ScanJob_" + new Date());
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    // <MAP>
    jobConf.setInputFormat(DefaultTabletInputFormat.class);
    jobConf.setMapperClass(ScanJobMapper.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.set(AbstractTabletInputFormat.INPUT_TABLE, tableName);
    jobConf.set(AbstractTabletInputFormat.INPUT_COLUMN_LIST, "url");
    // </MAP>

    // <REDUCE>
    FileOutputFormat.setOutputPath(jobConf,
            new Path("CloudataExamles_WebScanJob_" + System.currentTimeMillis()));
    jobConf.setReducerClass(ScanJobReducer.class);
    jobConf.setNumReduceTasks(1);
    // </REDUCE>

    try {
        JobClient.runJob(jobConf);
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.util.matrix.AbstractMatrix.java
License:Apache License
public void mutiply(AbstractMatrix targetMatrix, AbstractMatrix resultMatrix) throws IOException {
    Path tempOutputPath = new Path("temp/Matrix_" + System.currentTimeMillis());

    JobConf jobConf = new JobConf(AbstractMatrix.class);
    jobConf.setJobName("Matrix_Mutiply_Job" + "(" + new Date() + ")");

    //<MAP>
    jobConf.setMapperClass(MatrixMutiplyMap.class);
    jobConf.setInputFormat(MatrixInputFormat.class);
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_TABLE, ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_INPUT_COLUMN, columnName);
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_TABLE, targetMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_TARGET_COLUMN, targetMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_TARGET_SPARSE, targetMatrix.isSparse());
    jobConf.setMapOutputKeyClass(MatrixItem.class);
    jobConf.setMapOutputValueClass(Text.class);
    //</MAP>

    //<REDUCE>
    jobConf.setPartitionerClass(KeyRangePartitioner.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.setReducerClass(MatrixMutiplyReduce.class);
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_TABLE, resultMatrix.ctable.getTableName());
    jobConf.set(MatrixInputFormat.MATRIX_RESULT_COLUMN, resultMatrix.columnName);
    jobConf.setBoolean(MatrixInputFormat.MATRIX_RESULT_SPARSE, resultMatrix.isSparse());
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);

    TabletInfo[] tabletInfos = resultMatrix.ctable.listTabletInfos();
    jobConf.setNumReduceTasks(tabletInfos.length);
    jobConf.setMaxReduceAttempts(0);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    //delete temp output path
    FileSystem fs = FileSystem.get(jobConf);
    fs.delete(tempOutputPath, true);
}
From source file:org.cloudata.util.upload.UploadUtil.java
License:Apache License
private void doHadoopUpload(CloudataConf conf) throws IOException {
    if (!CTable.existsTable(conf, tableName)) {
        throw new IOException("No table:" + tableName);
    }

    JobConf jobConf = new JobConf(UploadUtil.class);
    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("UploadJob_" + tableName + "(" + new Date() + ")");

    //KeyRangePartitioner
    //AbstractTabletInputFormat.OUTPUT_TABLE
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);

    //<Map>
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set("uploadJob.delim", delim);

    String columnStr = "";
    for (String eachColumn : columns) {
        columnStr += eachColumn + ",";
    }
    jobConf.set("uploadJob.columns", columnStr);

    String fieldNumStr = "";
    for (int eachField : fieldNums) {
        fieldNumStr += eachField + ",";
    }
    jobConf.set("uploadJob.fieldNums", fieldNumStr);

    jobConf.setBoolean("uploadJob.keyValuePair", keyValuePair);
    jobConf.setMapperClass(UploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</Map>

    //<Reduce>
    Path tempOutputPath = new Path("temp/uploadJob/" + tableName + "/reducer");
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(0);
    //</Reduce>

    try {
        JobClient.runJob(jobConf);
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        FileUtil.delete(fs, tempOutputPath, true);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);

            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}