List of usage examples for org.apache.hadoop.mapred.JobConf.setInt
public void setInt(String name, int value)
Sets the value of the name property to an int.
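Before the per-project examples below, here is a minimal self-contained sketch of the call itself; the class, property name, and value are illustrative only and do not come from any of the source files:

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        // Hypothetical driver: store an int under a property name so that
        // tasks can later read it back with getInt(name, defaultValue).
        JobConf job = new JobConf(SetIntExample.class);
        job.setInt("example.batch.size", 500);
        System.out.println(job.getInt("example.batch.size", -1)); // prints 500
    }
}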
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/** Run the job. */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
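A run(String[]) method like the one above is typically wired up through ToolRunner. The class declaration is not part of the snippet, so the following driver is only a sketch that assumes CDXGenerator implements org.apache.hadoop.util.Tool, as its use of getConf() and run(String[]) suggests:

// Hypothetical main() for the driver above; assumes CDXGenerator implements
// org.apache.hadoop.util.Tool, which is not shown in the snippet.
public static void main(String[] args) throws Exception {
    int exitCode = org.apache.hadoop.util.ToolRunner.run(
            new org.apache.hadoop.conf.Configuration(), new CDXGenerator(), args);
    System.exit(exitCode);
}

With that driver, the command line mirrors the argument parsing in run(): an optional -strictMode flag, then the output directory, then one or more input globs.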
From source file:org.archive.hadoop.jobs.WARCMetadataRecordGenerator.java
License:Apache License
/** Run the job. */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WARCMetadataRecord Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WARCMetadataRecordGeneratorMapper.class);
    job.setJarByClass(WARCMetadataRecordGenerator.class);

    //extract outlinks by default
    job.set("outputType", "outlinks");
    int arg = 0;
    if (args[arg].equals("-hopinfo")) {
        job.set("outputType", "hopinfo");
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WARCMetadataRecordGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/** Run the job. */
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;
    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }

    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}
From source file:org.cloudata.core.testjob.performance.TestMultiThreadCTable.java
License:Apache License
public static Path putData(String outputDir) throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(TestMultiThreadCTable.class);
    jobConf.set("user.name", nconf.getUserId());

    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    jobConf.setJobName("TestMultiThreadNTable_" + "(" + new Date() + ")");
    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    Path outputPath = new Path(outputDir);
    FileOutputFormat.setOutputPath(jobConf, outputPath);

    JobClient jobClient = new JobClient();
    int numOfRowPerMap = 100000 / jobClient.getClusterStatus().getMaxMapTasks();
    jobConf.setInt("numOfRowPerMap", numOfRowPerMap);

    //<MAP>
    jobConf.setMapperClass(PutDataMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.setNumMapTasks(jobClient.getClusterStatus().getMaxMapTasks());
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
        return outputPath;
    } finally {
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
From source file:org.cloudata.core.testjob.tera.TeraJob.java
License:Apache License
public void runJob(String tableName, int numOfTablets, int dataLength, int totalGb, String keyOutputPath)
        throws IOException {
    CloudataConf nconf = new CloudataConf();

    JobConf jobConf = new JobConf(TeraJob.class);
    jobConf.set("user.name", nconf.getUserId());

    String libDir = CloudataMapReduceUtil.initMapReduce(jobConf);

    if (!CTable.existsTable(nconf, tableName)) {
        TableSchema tableInfo = new TableSchema(tableName, "Test");
        tableInfo.addColumn(new ColumnInfo("Col1"));
        tableInfo.addColumn(new ColumnInfo("Col2", TableSchema.CACHE_TYPE));
        tableInfo.addColumn(new ColumnInfo("Col3"));
        CTable.createTable(nconf, tableInfo);
    }

    jobConf.setJobName("TeraOnlineJob" + "(" + new Date() + ")");

    long rowsPerTask = ((((long) totalGb) * 1024L * 1024L * 1024L) / ((long) dataLength)) / (long) numOfTablets;

    jobConf.setInt("teraJob.dataLength", dataLength);
    jobConf.setLong("teraJob.rowsPerTask", rowsPerTask);

    jobConf.setLong("mapred.task.timeout", 30 * 60 * 1000);

    FileOutputFormat.setOutputPath(jobConf, new Path(keyOutputPath));

    //<MAP>
    jobConf.setMapperClass(TeraOnlineMap.class);
    jobConf.setInputFormat(SimpleInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, tableName);
    jobConf.setNumMapTasks(numOfTablets);
    jobConf.setMapSpeculativeExecution(false);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setNumReduceTasks(0);
    //</REDUCE>

    try {
        //Run Job
        JobClient.runJob(jobConf);
    } finally {
        //delete temp output path
        FileSystem fs = FileSystem.get(jobConf);
        CloudataMapReduceUtil.clearMapReduce(libDir);
    }
}
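As a sanity check on the rowsPerTask arithmetic above, with values chosen purely for illustration (they do not appear in the source):

// totalGb = 10, dataLength = 1024 bytes, numOfTablets = 100:
// rowsPerTask = ((10 * 1024 * 1024 * 1024) / 1024) / 100
//             = 10485760 / 100
//             = 104857 rows written by each of the 100 map tasks
//               (long integer division truncates the remainder)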
From source file:org.cloudata.examples.web.TermUploadJob.java
License:Apache License
public void exec(String[] options) throws Exception {
    if (options.length < 1) {
        System.out.println("Usage: java TermUploadJob <num of repeats> termUpload <inputPath> [#redcue]");
        System.exit(0);
    }
    JobConf jobConf = new JobConf(TermUploadJob.class);
    JobClient jobClinet = new JobClient(jobConf);
    int maxReduce = jobClinet.getClusterStatus().getMaxReduceTasks() * 2;
    if (options.length > 1) {
        maxReduce = Integer.parseInt(options[1]);
    }

    jobConf.setInt("mapred.task.timeout", 60 * 60 * 1000);

    FileSystem fs = FileSystem.get(jobConf);

    CloudataConf nconf = new CloudataConf();
    if (!CTable.existsTable(nconf, TERM_TABLE)) {
        //Table
        Path path = new Path("blogdata/tmp/weight");
        FileStatus[] paths = fs.listStatus(path);
        if (paths == null || paths.length == 0) {
            LOG.error("No Partition info:" + path);
            return;
        }
        SortedSet<Text> terms = new TreeSet<Text>();
        Text text = new Text();
        for (FileStatus eachPath : paths) {
            CloudataLineReader reader = new CloudataLineReader(fs.open(eachPath.getPath()));
            while (true) {
                int length = reader.readLine(text);
                if (length <= 0) {
                    break;
                }
                terms.add(new Text(text));
            }
        }

        int temrsPerTablet = terms.size() / (maxReduce - 1);
        int count = 0;
        List<Row.Key> rowKeys = new ArrayList<Row.Key>();
        for (Text term : terms) {
            count++;
            if (count == temrsPerTablet) {
                rowKeys.add(new Row.Key(term.getBytes()));
                count = 0;
            }
        }
        rowKeys.add(Row.Key.MAX_KEY);

        TableSchema temrTableInfo = new TableSchema(TERM_TABLE, "Test", TERM_TABLE_COLUMNS);
        CTable.createTable(nconf, temrTableInfo, rowKeys.toArray(new Row.Key[] {}));
    }
    CTable termTable = CTable.openTable(nconf, TERM_TABLE);
    TabletInfo[] tabletInfos = termTable.listTabletInfos();

    Path tempOutputPath = new Path("WebTableJob_" + System.currentTimeMillis());

    jobConf.setJobName("TermUploadJob" + "(" + new Date() + ")");
    FileInputFormat.addInputPath(jobConf, new Path(options[0]));

    //<MAP>
    jobConf.setMapperClass(TermUploadMap.class);
    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setInputFormat(TextInputFormat.class);
    jobConf.set(AbstractTabletInputFormat.OUTPUT_TABLE, TERM_TABLE);
    jobConf.setPartitionerClass(WebKeyRangePartitioner.class);
    jobConf.setMaxMapAttempts(0);
    //</MAP>

    //<REDUCE>
    jobConf.setReducerClass(TermUploadReduce.class);
    jobConf.setOutputKeyClass(Text.class);
    jobConf.setOutputValueClass(Text.class);
    jobConf.setNumReduceTasks(tabletInfos.length);
    FileOutputFormat.setOutputPath(jobConf, tempOutputPath);
    jobConf.setNumReduceTasks(maxReduce);
    jobConf.setMaxReduceAttempts(0);
    //</REDUCE>

    //Run Job
    JobClient.runJob(jobConf);

    fs.delete(tempOutputPath);
}
From source file:org.commoncrawl.hadoop.io.ARCInputFormat.java
License:Open Source License
/**
 * Sets the number of bytes to read at a time from each input stream.
 *
 * @param job
 *          the job to set the IO block size for
 * @param blockSize
 *          the IO block size to use
 *
 * @see #P_IO_BLOCK_SIZE
 */
public static void setIOBlockSize(JobConf job, int blockSize) {
    job.setInt(P_IO_BLOCK_SIZE, blockSize);
}
From source file:org.commoncrawl.hadoop.io.ARCInputFormat.java
License:Open Source License
/**
 * Sets the number of bytes to use for IO buffering.
 *
 * @param job
 *          the job to set the buffer size for
 * @param bufferSize
 *          the number of bytes to use for IO buffering
 *
 * @see #P_IO_BUFFER_SIZE
 */
public static void setIOBufferSize(JobConf job, int bufferSize) {
    job.setInt(P_IO_BUFFER_SIZE, bufferSize);
}
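Both ARCInputFormat helpers above are thin wrappers that record an int in the job configuration under the format's own property keys (P_IO_BLOCK_SIZE and P_IO_BUFFER_SIZE). A short usage sketch; the block and buffer sizes are arbitrary example values, not recommendations from the source:

JobConf job = new JobConf(ARCInputFormat.class);
job.setInputFormat(ARCInputFormat.class);              // use the ARC input format for this job
ARCInputFormat.setIOBlockSize(job, 64 * 1024);         // example: read 64 KB at a time
ARCInputFormat.setIOBufferSize(job, 4 * 1024 * 1024);  // example: 4 MB IO buffer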
From source file:org.commoncrawl.hadoop.io.ARCSplitCalculator.java
License:Open Source License
/**
 * Sets the desired number of files per input split.
 *
 * <p>
 * Default is 1.
 *
 * @param job
 *          the job to set the number of files per split for
 * @param filesPerSplit
 *          the desired number of ARC files per split
 *
 * @see #P_FILES_PER_SPLIT
 */
public static final void setFilesPerSplit(JobConf job, int filesPerSplit) {
    job.setInt(P_FILES_PER_SPLIT, filesPerSplit);
}
From source file:org.commoncrawl.hadoop.io.ARCSplitCalculator.java
License:Open Source License
/**
 * Sets the desired number of megabytes per split.
 *
 * <p>
 * New files will be added to a split until the total size of the split
 * exceeds this threshold. Default is no limit.
 *
 * @param job
 *          the job to set the number of megabytes per split for
 * @param mbPerSplit
 *          the desired number of megabytes per split
 */
public static final void setMegabytesPerSplit(JobConf job, int mbPerSplit) {
    job.setInt(P_MB_PER_SPLIT, mbPerSplit);
}
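As with the ARCInputFormat helpers, these ARCSplitCalculator setters simply store an int in the job configuration. A usage sketch with illustrative values; the source's only stated defaults are one file per split and no megabyte limit:

JobConf job = new JobConf(ARCSplitCalculator.class);
ARCSplitCalculator.setFilesPerSplit(job, 10);      // example: group 10 ARC files per split
ARCSplitCalculator.setMegabytesPerSplit(job, 512); // example: cap a split at roughly 512 MB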