List of usage examples for org.apache.hadoop.mapred JobConf setInt
public void setInt(String name, int value)
Set the value of the name property to an int.
From source file:edu.ucsb.cs.lsh.projection.ProjectionsGenerator.java
License:Apache License
/**
 * Loads the collection feature count into the job configuration, if a count file exists,
 * and returns the resulting configured value.
 *
 * <p>When {@code Properties.NUM_FEATURES_FILE} exists on {@code hdfs}, its first line is parsed
 * as an integer and stored under {@code ProjectionLshDriver.LSH_NFEATURES_PROPERTY}. The file is
 * opened through the same {@code FileSystem} that was used for the existence check, so the check
 * and the read always refer to the same file.
 *
 * @param hdfs file system holding the feature-count file
 * @param job  job configuration that receives the feature count
 * @return the value of {@code LSH_NFEATURES_PROPERTY} in {@code job}, or
 *         {@code ProjectionLshDriver.LSH_NFEATURES_VALUE} when the property is not set
 * @throws IOException           if the file cannot be opened or read
 * @throws NumberFormatException if the first line of the file is not an integer
 */
public static int readCollectionFeatureCount(FileSystem hdfs, JobConf job) throws IOException {
    Path nFeaturesPath = new Path(Properties.NUM_FEATURES_FILE);
    if (hdfs.exists(nFeaturesPath)) {
        BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(nFeaturesPath)));
        try {
            String line = br.readLine();
            if (line != null) {
                // Trim so trailing whitespace or a stray '\r' does not break parsing.
                job.setInt(ProjectionLshDriver.LSH_NFEATURES_PROPERTY, Integer.parseInt(line.trim()));
            }
        } finally {
            // Release the underlying stream even when reading or parsing fails.
            br.close();
        }
    }
    return job.getInt(ProjectionLshDriver.LSH_NFEATURES_PROPERTY, ProjectionLshDriver.LSH_NFEATURES_VALUE);
}
From source file:edu.ucsb.cs.lsh.projection.SignaturesGenerator.java
License:Apache License
public static void main(String[] args) throws Exception { JobConf job = new JobConf(SignaturesGenerator.class); new GenericOptionsParser(job, args); job.setJobName(SignaturesGenerator.class.getSimpleName()); int nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE); setParameters();/*from www . j ava2 s . c o m*/ FileSystem fs = FileSystem.get(job); prepareDistributedCache(job, fs, new Path(ProjectionsGenerator.OUTPUT_DIR)); Path outputPath = new Path(OUTPUT_DIR); if (fs.exists(outputPath)) fs.delete(outputPath); FileInputFormat.setInputPaths(job, INPUT_DIR); // Path(INPUT_DIR)); FileOutputFormat.setOutputPath(job, outputPath); // FileOutputFormat.setCompressOutput(job, false); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.set("mapred.child.java.opts", "-Xmx2048m"); job.setInt("mapred.map.max.attempts", 10); job.setInt("mapred.reduce.max.attempts", 10); job.setInt("mapred.task.timeout", 6000000); job.setMapperClass(SigMapper.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(BitSignature.class); job.setNumReduceTasks(0); job.setOutputKeyClass(LongWritable.class); job.setOutputValueClass(BitSignature.class); JobSubmitter.run(job, "LSH", -1); }
From source file:edu.umd.cloud9.collection.clue.RepackClueWarcRecords.java
License:Apache License
/** * Runs this tool.// w ww . j a va 2s .co m */ public int run(String[] args) throws Exception { if (args.length != 5) { printUsage(); return -1; } String basePath = args[0]; String outputPath = args[1]; int segment = Integer.parseInt(args[2]); String data = args[3]; String compressionType = args[4]; if (!compressionType.equals("block") && !compressionType.equals("record") && !compressionType.equals("none")) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); System.exit(-1); } // Default block size. int blocksize = 1000000; JobConf conf = new JobConf(RepackClueWarcRecords.class); conf.setJobName("RepackClueWarcRecords:segment" + segment); conf.set("DocnoMappingDataFile", data); LOG.info("Tool name: RepackClueWarcRecords"); LOG.info(" - base path: " + basePath); LOG.info(" - output path: " + outputPath); LOG.info(" - segment number: " + segment); LOG.info(" - docno mapping data file: " + data); LOG.info(" - compression type: " + compressionType); if (compressionType.equals("block")) { LOG.info(" - block size: " + blocksize); } int mapTasks = 10; conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment); SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); if (compressionType.equals("none")) { SequenceFileOutputFormat.setCompressOutput(conf, false); } else { SequenceFileOutputFormat.setCompressOutput(conf, true); if (compressionType.equals("record")) { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); conf.setInt("io.seqfile.compress.blocksize", blocksize); } } conf.setInputFormat(ClueWarcInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ClueWarcRecord.class); conf.setMapperClass(MyMapper.class); // Delete 
the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.collection.trecweb.RepackGov2Documents.java
License:Apache License
/** * Runs this tool./*from w w w . j av a 2 s.c o m*/ */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String basePath = args[0]; String outputPath = args[1]; String compressionType = args[2]; if (!compressionType.equals("block") && !compressionType.equals("record") && !compressionType.equals("none")) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); System.exit(-1); } // this is the default block size int blocksize = 1000000; JobConf conf = new JobConf(RepackGov2Documents.class); conf.setJobName("RepackGov2Documents"); sLogger.info("Tool name: RepackGov2Documents"); sLogger.info(" - base path: " + basePath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - compression type: " + compressionType); if (compressionType.equals("block")) { sLogger.info(" - block size: " + blocksize); } int mapTasks = 10; conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(500); // 272 for (int i = 0; i <= 272; i++) { String path = basePath + "/GX"; String indexNum = Integer.toString(i); if (indexNum.length() == 1) { path += "00"; } if (indexNum.length() == 2) { path += "0"; } path += indexNum; FileInputFormat.addInputPath(conf, new Path(path)); } SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); if (compressionType.equals("none")) { SequenceFileOutputFormat.setCompressOutput(conf, false); } else { SequenceFileOutputFormat.setCompressOutput(conf, true); if (compressionType.equals("record")) { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); conf.setInt("io.seqfile.compress.blocksize", blocksize); } } conf.setInputFormat(TrecWebDocumentInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(TrecWebDocument.class); 
conf.setMapperClass(MyMapper.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.collection.trecweb.RepackWt10gDocuments.java
License:Apache License
/** * Runs this tool./*from w w w. j av a 2 s .co m*/ */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String basePath = args[0]; String outputPath = args[1]; String compressionType = args[2]; if (!compressionType.equals("block") && !compressionType.equals("record") && !compressionType.equals("none")) { System.err.println("Error: \"" + compressionType + "\" unknown compression type!"); System.exit(-1); } // this is the default block size int blocksize = 1000000; JobConf conf = new JobConf(RepackWt10gDocuments.class); conf.setJobName("RepackWt10gDocuments"); sLogger.info("Tool name: RepackWt10gDocuments"); sLogger.info(" - base path: " + basePath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - compression type: " + compressionType); if (compressionType.equals("block")) { sLogger.info(" - block size: " + blocksize); } int mapTasks = 10; conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(50); for (int i = 1; i <= 104; i++) { String path = basePath + "/WTX"; String indexNum = Integer.toString(i); if (indexNum.length() == 1) { path += "00"; } if (indexNum.length() == 2) { path += "0"; } path += indexNum; FileInputFormat.addInputPath(conf, new Path(path)); } SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); if (compressionType.equals("none")) { SequenceFileOutputFormat.setCompressOutput(conf, false); } else { SequenceFileOutputFormat.setCompressOutput(conf, true); if (compressionType.equals("record")) { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); } else { SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); conf.setInt("io.seqfile.compress.blocksize", blocksize); } } conf.setInputFormat(TrecWebDocumentInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(LongWritable.class); conf.setOutputValueClass(TrecWebDocument.class); 
conf.setMapperClass(MyMapper.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.BuildPageRankRecords.java
License:Apache License
/** * Runs this tool./*from ww w . j a v a 2 s.c om*/ */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int n = Integer.parseInt(args[2]); sLogger.info("Tool name: BuildPageRankRecords"); sLogger.info(" - inputDir: " + inputPath); sLogger.info(" - outputDir: " + outputPath); sLogger.info(" - numNodes: " + n); JobConf conf = new JobConf(BuildPageRankRecords.class); conf.setJobName("PackageLinkGraph"); conf.setNumMapTasks(1); conf.setNumReduceTasks(0); conf.setInt("NodeCnt", n); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); TextInputFormat.addInputPath(conf, new Path(inputPath)); SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.FindMaxPageRankNodes.java
License:Apache License
/** * Runs this tool.//from www . j a va2s .com */ public int run(String[] args) throws Exception { if (args.length != 3) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int n = Integer.parseInt(args[2]); sLogger.info("Tool name: FindMaxPageRankNodes"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); sLogger.info(" - n: " + n); JobConf conf = new JobConf(FindMaxPageRankNodes.class); conf.setJobName("FindMaxPageRankNodes"); conf.setNumMapTasks(1); conf.setNumReduceTasks(1); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setInt("n", n); FileInputFormat.addInputPath(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(FloatWritable.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(FloatWritable.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); JobClient.runJob(conf); return 0; }
From source file:edu.umd.cloud9.pagerank.PartitionGraph.java
License:Apache License
/**
 * Runs a job that rewrites {@code IntWritable}/{@code PageRankNode} SequenceFiles into the
 * requested number of partitions (one reduce task per partition).
 *
 * <p>Expects exactly five arguments, in order: input path, output path, number of partitions,
 * a range flag (any non-zero integer selects {@code RangePartitioner}), and the node count.
 * The node count is passed to the job under the {@code NodeCount} key. The output directory is
 * deleted before the job runs.
 *
 * @param args command-line arguments as described above
 * @return {@code -1} when the argument count is wrong, {@code 0} after the job has run
 * @throws IOException if deleting the output directory or running the job fails
 */
public int run(String[] args) throws IOException {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    final String inPath = args[0];
    final String outPath = args[1];
    final int numParts = Integer.parseInt(args[2]);
    final int rangeFlag = Integer.parseInt(args[3]);
    final boolean useRange = rangeFlag != 0;
    final int nodeCount = Integer.parseInt(args[4]);

    sLogger.info("Tool name: PartitionGraph");
    sLogger.info(" - inputDir: " + inPath);
    sLogger.info(" - outputDir: " + outPath);
    sLogger.info(" - numPartitions: " + numParts);
    sLogger.info(" - useRange?: " + useRange);
    sLogger.info(" - nodeCnt: " + nodeCount);

    final JobConf conf = new JobConf(PartitionGraph.class);
    conf.setJobName("Partition Graph " + numParts);
    conf.setNumReduceTasks(numParts);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("NodeCount", nodeCount);

    FileInputFormat.setInputPaths(conf, new Path(inPath));
    FileOutputFormat.setOutputPath(conf, new Path(outPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    // Clear any previous output before running.
    final Path staleOutput = new Path(outPath);
    FileSystem.get(conf).delete(staleOutput, true);

    JobClient.runJob(conf);
    return 0;
}
From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws IOException { JobConf conf = new JobConf(RunPageRankBasic.class); String in = path + "/iter" + sFormat.format(i); String out = path + "/iter" + sFormat.format(j) + "t"; String outm = out + "-mass"; // we need to actually count the number of part files to get the number // of partitions (because the directory might contain _log) int numPartitions = 0; for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) { if (s.getPath().getName().contains("part-")) numPartitions++;/*w ww . j a v a2 s .c o m*/ } sLogger.info("PageRank: iteration " + j + ": Phase1"); sLogger.info(" - input: " + in); sLogger.info(" - output: " + out); sLogger.info(" - nodeCnt: " + n); sLogger.info(" - useCombiner: " + useCombiner); sLogger.info(" - useInmapCombiner: " + useInmapCombiner); sLogger.info(" - useRange: " + useRange); sLogger.info("computed number of partitions: " + numPartitions); int numMapTasks = numPartitions; int numReduceTasks = numPartitions; conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1"); conf.setInt("NodeCount", n); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("PageRankMassPath", outm); FileInputFormat.setInputPaths(conf, new Path(in)); FileOutputFormat.setOutputPath(conf, new Path(out)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); if (useInmapCombiner) { conf.setMapperClass(MapWithInMapperCombiningClass.class); } else { conf.setMapperClass(MapClass.class); } if (useCombiner) { conf.setCombinerClass(CombineClass.class); } if (useRange) { 
conf.setPartitionerClass(RangePartitioner.class); } conf.setReducerClass(ReduceClass.class); conf.setSpeculativeExecution(false); FileSystem.get(conf).delete(new Path(out), true); FileSystem.get(conf).delete(new Path(outm), true); JobClient.runJob(conf); float mass = Float.NEGATIVE_INFINITY; FileSystem fs = FileSystem.get(conf); for (FileStatus f : fs.listStatus(new Path(outm))) { FSDataInputStream fin = fs.open(f.getPath()); mass = sumLogProbs(mass, fin.readFloat()); fin.close(); } return mass; }
From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java
License:Apache License
private void phase2(String path, int i, int j, int n, float missing) throws IOException { JobConf conf = new JobConf(RunPageRankBasic.class); sLogger.info("missing PageRank mass: " + missing); sLogger.info("number of nodes: " + n); String in = path + "/iter" + sFormat.format(j) + "t"; String out = path + "/iter" + sFormat.format(j); sLogger.info("PageRank: iteration " + j + ": Phase2"); sLogger.info(" - input: " + in); sLogger.info(" - output: " + out); int numMapTasks = FileSystem.get(conf).listStatus(new Path(in)).length; int numReduceTasks = 0; conf.setJobName("PageRank:Basic:iteration" + j + ":Phase2"); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.setFloat("MissingMass", (float) missing); conf.setInt("NodeCount", n); conf.setNumMapTasks(numMapTasks);//w ww. j a va 2 s . c om conf.setNumReduceTasks(numReduceTasks); FileInputFormat.setInputPaths(conf, new Path(in)); FileOutputFormat.setOutputPath(conf, new Path(out)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(PageRankNode.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); conf.setMapperClass(MapPageRankMassDistributionClass.class); conf.setCombinerClass(IdentityReducer.class); conf.setReducerClass(IdentityReducer.class); FileSystem.get(conf).delete(new Path(out), true); JobClient.runJob(conf); }