Usage examples for org.apache.hadoop.mapred.JobConf.set
public void set(String name, String value)
Sets the value of the name property.
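A minimal sketch of the call (a hypothetical standalone snippet, not taken from the examples below; the property key is one that several of those examples also set):

JobConf job = new JobConf();
job.set("mapred.child.java.opts", "-Xmx1024m");  // stores the String value under the given name
String opts = job.get("mapred.child.java.opts"); // reads it back: "-Xmx1024m"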
From source file:edu.ucsb.cs.lsh.projection.ProjectionsGenerator.java
License:Apache License
public static void main(JobConf job) throws IOException {
    int nBits /* D */, nFeatures /* K */, nReducers;
    job.setJobName(ProjectionsGenerator.class.getSimpleName());
    FileSystem fs = FileSystem.get(job);

    nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE);
    nFeatures = readCollectionFeatureCount(fs, job);
    setParameters(nBits, nFeatures);
    nReducers = job.getInt(ProjectionLshDriver.LSH_NREDUCER_PROPERTY, ProjectionLshDriver.LSH_NREDUCER_VALUE);

    Path inputPath = new Path(INPUT_DIR);
    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    if (fs.exists(inputPath))
        fs.delete(inputPath, true);

    SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, new Path(inputPath.toString() + "/file"),
        IntWritable.class, IntWritable.class);
    for (int i = 0; i < nReducers; i++)
        writer.append(new IntWritable(i), new IntWritable(i));
    writer.close();

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileOutputFormat.setCompressOutput(job, false);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);
    job.setNumMapTasks(1);
    job.setNumReduceTasks(nReducers);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setReducerClass(ProjectionReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(RandomVector.class);

    JobSubmitter.run(job, "LSH", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));
}
From source file:edu.ucsb.cs.lsh.projection.SignaturesGenerator.java
License:Apache License
public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SignaturesGenerator.class);
    new GenericOptionsParser(job, args);
    job.setJobName(SignaturesGenerator.class.getSimpleName());
    int nBits = job.getInt(ProjectionLshDriver.LSH_NBITS_PROPERTY, ProjectionLshDriver.LSH_NBITS_VALUE);
    setParameters();

    FileSystem fs = FileSystem.get(job);
    prepareDistributedCache(job, fs, new Path(ProjectionsGenerator.OUTPUT_DIR));
    Path outputPath = new Path(OUTPUT_DIR);
    if (fs.exists(outputPath))
        fs.delete(outputPath);

    FileInputFormat.setInputPaths(job, INPUT_DIR);
    FileOutputFormat.setOutputPath(job, outputPath);
    // FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setInt("mapred.map.max.attempts", 10);
    job.setInt("mapred.reduce.max.attempts", 10);
    job.setInt("mapred.task.timeout", 6000000);

    job.setMapperClass(SigMapper.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(BitSignature.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(BitSignature.class);

    JobSubmitter.run(job, "LSH", -1);
}
From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java
License:Apache License
/**
 * Uniformly partitions the sequence vectors given the number of partitions
 * set in the configuration file. It also writes information about each
 * partition (maximum p-norms, weights, or norms/weights/lengths) to a file
 * that guides the subsequent core static partitioning for skipping.
 *
 * @param norm_weight_all selects the partition statistic: 1 for maximum
 *        p-norm, 2 for maximum weight, anything else for all
 * @return the configured JobConf
 */
public static JobConf main(String[] args, int norm_weight_all) throws IOException {
    JobConf job = new JobConf();
    new GenericOptionsParser(job, args);
    job.setJarByClass(Partitioner.class);
    System.out.println(
        JobSubmitter.stars() + "\n Running partitioner to prepare uniform partitions (Single JVM) ");

    String inputDir = SortDriver.OUTPUT_DIR, maxDir;
    if (norm_weight_all == 1)
        maxDir = "/maxpnorm";
    else if (norm_weight_all == 2)
        maxDir = "/maxweight";
    else
        maxDir = "/maxall";

    if (!(new Path(inputDir).getFileSystem(job)).exists(new Path(inputDir)))
        throw new UnsupportedOperationException("ERROR: " + inputDir + " directory not set.");

    job.set(MAX_DIR_PATH, maxDir);
    job.set(Config.NUM_PARTITIONS_PROPERTY,
        Integer.toString(produceStaticParitions(job, inputDir, OUTPUT_DIR, maxDir,
            job.getInt(Config.NUM_PARTITIONS_PROPERTY, Config.NUM_PARTITIONS_VALUE), norm_weight_all)));
    return job;
}
From source file:edu.umd.cloud9.collection.clue.RepackClueWarcRecords.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String basePath = args[0];
    String outputPath = args[1];
    int segment = Integer.parseInt(args[2]);
    String data = args[3];
    String compressionType = args[4];

    if (!compressionType.equals("block") && !compressionType.equals("record")
            && !compressionType.equals("none")) {
        System.err.println("Error: \"" + compressionType + "\" unknown compression type!");
        System.exit(-1);
    }

    // Default block size.
    int blocksize = 1000000;

    JobConf conf = new JobConf(RepackClueWarcRecords.class);
    conf.setJobName("RepackClueWarcRecords:segment" + segment);

    conf.set("DocnoMappingDataFile", data);

    LOG.info("Tool name: RepackClueWarcRecords");
    LOG.info(" - base path: " + basePath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - segment number: " + segment);
    LOG.info(" - docno mapping data file: " + data);
    LOG.info(" - compression type: " + compressionType);

    if (compressionType.equals("block")) {
        LOG.info(" - block size: " + blocksize);
    }

    int mapTasks = 10;
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);

    ClueCollectionPathConstants.addEnglishCollectionPart(conf, basePath, segment);

    SequenceFileOutputFormat.setOutputPath(conf, new Path(outputPath));

    if (compressionType.equals("none")) {
        SequenceFileOutputFormat.setCompressOutput(conf, false);
    } else {
        SequenceFileOutputFormat.setCompressOutput(conf, true);
        if (compressionType.equals("record")) {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);
        } else {
            SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
            conf.setInt("io.seqfile.compress.blocksize", blocksize);
        }
    }

    conf.setInputFormat(ClueWarcInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ClueWarcRecord.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:edu.umd.cloud9.collection.medline.DemoCountMedlineCitations.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(DemoCountMedlineCitations.class);
    conf.setJobName("DemoCountMedlineCitations");
    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general in
    // being able to load any collection of Indexable objects that has a
    // docid/docno mapping specified by a DocnoMapping object.
    conf.set("DocnoMappingClass", "edu.umd.cloud9.collection.medline.MedlineDocnoMapping");

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(MedlineCitationInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:edu.umd.cloud9.collection.trec.BuildTrecForwardIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(getConf(), BuildTrecForwardIndex.class);
    FileSystem fs = FileSystem.get(getConf());

    String collectionPath = args[0];
    String outputPath = args[1];
    String indexFile = args[2];
    String mappingFile = args[3];

    LOG.info("Tool name: " + BuildTrecForwardIndex.class.getCanonicalName());
    LOG.info(" - collection path: " + collectionPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - index file: " + indexFile);
    LOG.info(" - mapping file: " + mappingFile);

    conf.setJobName(BuildTrecForwardIndex.class.getSimpleName());

    conf.set("mapred.child.java.opts", "-Xmx1024m");
    conf.setNumReduceTasks(1);

    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set(DOCNO_MAPPING_FILE_PROPERTY, mappingFile);
    } else {
        DistributedCache.addCacheFile(new URI(mappingFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(collectionPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    RunningJob job = JobClient.runJob(conf);

    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Count.DOCS).getCounter();

    String inputFile = outputPath + "/" + "part-00000";

    LOG.info("Writing " + numDocs + " doc offsets to " + indexFile);
    FSLineReader reader = new FSLineReader(inputFile, fs);

    FSDataOutputStream writer = fs.create(new Path(indexFile), true);
    writer.writeUTF(edu.umd.cloud9.collection.trec.TrecForwardIndex.class.getCanonicalName());
    writer.writeUTF(collectionPath);
    writer.writeInt(numDocs);

    int cnt = 0;
    Text line = new Text();
    while (reader.readLine(line) > 0) {
        String[] arr = line.toString().split("\\t");
        long offset = Long.parseLong(arr[1]);
        int len = Integer.parseInt(arr[2]);

        writer.writeLong(offset);
        writer.writeInt(len);

        cnt++;
        if (cnt % 100000 == 0) {
            LOG.info(cnt + " docs");
        }
    }
    reader.close();
    writer.close();
    LOG.info(cnt + " docs total. Done!");

    if (numDocs != cnt) {
        throw new RuntimeException("Unexpected number of documents in building forward index!");
    }

    return 0;
}
From source file:edu.umd.cloud9.collection.trec.DemoCountTrecDocuments.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];
    String mappingFile = args[2];

    LOG.info("Tool: " + DemoCountTrecDocuments.class.getCanonicalName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output dir: " + outputPath);
    LOG.info(" - docno mapping file: " + mappingFile);

    JobConf conf = new JobConf(getConf(), DemoCountTrecDocuments.class);
    conf.setJobName(DemoCountTrecDocuments.class.getSimpleName());

    conf.setNumReduceTasks(0);

    // Pass in the class name as a String; this makes the mapper general in being able to load
    // any collection of Indexable objects that has docid/docno mapping specified by a
    // DocnoMapping object.
    conf.set("DocnoMappingClass", edu.umd.cloud9.collection.trec.TrecDocnoMapping.class.getCanonicalName());

    // Put the mapping file in the distributed cache so each map worker will have it.
    DistributedCache.addCacheFile(new URI(mappingFile), conf);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(TrecDocumentInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    JobClient.runJob(conf);

    return 0;
}
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping.java
License:Apache License
@SuppressWarnings("static-access") @Override//from w w w . j av a2 s .c o m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("tmp output directory") .create(OUTPUT_PATH_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file") .create(OUTPUT_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); options.addOption(KEEP_ALL_OPTION, false, "keep all pages"); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_PATH_OPTION) || !cmdline.hasOption(OUTPUT_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } String inputPath = cmdline.getOptionValue(INPUT_OPTION); String outputPath = cmdline.getOptionValue(OUTPUT_PATH_OPTION); String outputFile = cmdline.getOptionValue(OUTPUT_FILE_OPTION); boolean keepAll = cmdline.hasOption(KEEP_ALL_OPTION); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - output file: " + outputFile); LOG.info(" - keep all pages: " + keepAll); LOG.info(" - language: " + language); JobConf conf = new JobConf(getConf(), BuildWikipediaDocnoMapping.class); conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language)); conf.setBoolean(KEEP_ALL_OPTION, keepAll); if (language != null) { conf.set("wiki.language", language); } conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); conf.setInputFormat(WikipediaPageInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(IntWritable.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already. FileSystem.get(conf).delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); Counters c = job.getCounters(); long cnt = keepAll ? c.getCounter(PageTypes.TOTAL) : c.getCounter(PageTypes.ARTICLE); WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(conf), outputPath + "/part-00000", (int) cnt, outputFile); return 0; }
From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaForwardIndex.java
License:Apache License
@SuppressWarnings("static-access") @Override//w w w . java 2 s . c o m public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("tmp output directory") .create(OUTPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg() .withDescription("two-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String outputPath = cmdline.getOptionValue(OUTPUT_OPTION); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (language.length() != 2) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), BuildWikipediaForwardIndex.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - index file: " + indexFile); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); LOG.info(" - language: " + language); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // delete the output directory if it exists already fs.delete(new Path(outputPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(outputPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF("edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex"); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = 
Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } return 0; }
From source file:edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndexBuilder.java
License:Apache License
@SuppressWarnings("static-access") @Override/*from www . j a v a 2s .c om*/ public int run(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("index file").create(INDEX_FILE_OPTION)); options.addOption(OptionBuilder .withArgName("en|sv|nl|de|fr|ru|it|es|vi|pl|ja|pt|zh|uk|ca|fa|no|fi|id|ar|sr|ko|hi|zh_yue|cs|tr") .hasArg().withDescription("two-letter or six-letter language code").create(LANGUAGE_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_FILE_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } Path inputPath = new Path(cmdline.getOptionValue(INPUT_OPTION)); String indexFile = cmdline.getOptionValue(INDEX_FILE_OPTION); String tmpPath = "tmp-" + WikipediaForwardIndexBuilder.class.getSimpleName() + "-" + RANDOM.nextInt(10000); if (!inputPath.isAbsolute()) { System.err.println("Error: " + INPUT_OPTION + " must be an absolute path!"); return -1; } String language = null; if (cmdline.hasOption(LANGUAGE_OPTION)) { language = cmdline.getOptionValue(LANGUAGE_OPTION); if (!(language.length() == 2 || language.length() == 6)) { System.err.println("Error: \"" + language + "\" unknown language!"); return -1; } } JobConf conf = new JobConf(getConf(), WikipediaForwardIndexBuilder.class); FileSystem fs = FileSystem.get(conf); LOG.info("Tool name: " + this.getClass().getName()); LOG.info(" - input path: " + inputPath); LOG.info(" - index file: " + indexFile); LOG.info(" - language: " + language); LOG.info("Note: This tool only works on block-compressed SequenceFiles!"); conf.setJobName(String.format("BuildWikipediaForwardIndex[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath, INDEX_FILE_OPTION, indexFile, LANGUAGE_OPTION, language)); conf.setNumReduceTasks(1); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(tmpPath)); FileOutputFormat.setCompressOutput(conf, false); if (language != null) { conf.set("wiki.language", language); } conf.setInputFormat(NoSplitSequenceFileInputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(Text.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(IdentityReducer.class); // Delete the output directory if it exists already. 
fs.delete(new Path(tmpPath), true); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int blocks = (int) counters.getCounter(Blocks.Total); LOG.info("number of blocks: " + blocks); LOG.info("Writing index file..."); LineReader reader = new LineReader(fs.open(new Path(tmpPath + "/part-00000"))); FSDataOutputStream out = fs.create(new Path(indexFile), true); out.writeUTF(edu.umd.cloud9.collection.wikipedia.WikipediaForwardIndex.class.getCanonicalName()); out.writeUTF(inputPath.toString()); out.writeInt(blocks); int cnt = 0; Text line = new Text(); while (reader.readLine(line) > 0) { String[] arr = line.toString().split("\\s+"); int docno = Integer.parseInt(arr[0]); int offset = Integer.parseInt(arr[1]); short fileno = Short.parseShort(arr[2]); out.writeInt(docno); out.writeInt(offset); out.writeShort(fileno); cnt++; if (cnt % 100000 == 0) { LOG.info(cnt + " blocks written"); } } reader.close(); out.close(); if (cnt != blocks) { throw new RuntimeException("Error: mismatch in block count!"); } // Clean up. fs.delete(new Path(tmpPath), true); return 0; }