List of usage examples for org.apache.hadoop.mapred JobConf setMapOutputKeyClass
public void setMapOutputKeyClass(Class<?> theClass)
From source file:io.fluo.stress.trie.Unique.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length < 1) { log.error("Usage: " + this.getClass().getSimpleName() + "<input dir>{ <input dir>}"); System.exit(-1);//from w w w. j a v a 2 s . c om } JobConf job = new JobConf(getConf()); job.setJobName(Unique.class.getName()); job.setJarByClass(Unique.class); job.setInputFormat(SequenceFileInputFormat.class); for (String arg : args) { SequenceFileInputFormat.addInputPath(job, new Path(arg)); } job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(NullWritable.class); job.setReducerClass(UniqueReducer.class); job.setOutputFormat(NullOutputFormat.class); RunningJob runningJob = JobClient.runJob(job); runningJob.waitForCompletion(); numUnique = (int) runningJob.getCounters().getCounter(Stats.UNIQUE); log.debug("numUnique : " + numUnique); return runningJob.isSuccessful() ? 0 : -1; }
From source file:ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { // sLogger.setLevel(Level.DEBUG); sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors"); JobConf conf = new JobConf(BuildTargetLangWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String vocabFile = getConf().get("Ivory.FinalVocab"); DistributedCache.addCacheFile(new URI(vocabFile), conf); Path inputPath = new Path(PwsimEnvironment.getFileNameWithPars(indexPath, "TermDocs")); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return -1; }//from w ww .ja v a2s . 
c o m conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob rj = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = rj.getCounters(); long numOfDocs = (long) counters.findCounter(Docs.Total).getCounter(); return (int) numOfDocs; }
From source file:ivory.core.preprocess.BuildWeightedIntDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.setLevel(Level.WARN); sLogger.info("PowerTool: GetWeightedIntDocVectors"); // create a new JobConf, inheriting from the configuration of this // PowerTool//from w w w.java 2 s . c om JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedIntDocVectorsDirectory(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); String collectionName = conf.get("Ivory.CollectionName"); sLogger.info("Characteristics of the collection:"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info("Characteristics of the job:"); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - MinSplitSize: " + minSplitSize); String dfByIntFilePath = env.getDfByIntData(); String cfByIntFilePath = env.getCfByIntData(); /* add df table to cache */ if (!fs.exists(new Path(dfByIntFilePath))) { throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf); /* add cf table to cache */ if (!fs.exists(new Path(cfByIntFilePath))) { throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { sLogger.info("Output path already exists!"); return 0; } 
//fs.delete(weightedVectirsPath, true); conf.setJobName("GetWeightedIntDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(WeightedIntDocVector.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(WeightedIntDocVector.class); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.info("PowerTool: GetWeightedTermDocVectors"); JobConf conf = new JobConf(BuildWeightedTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedTermDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); String termsFilePath = env.getIndexTermsData(); String termsIdsFilePath = env.getIndexTermIdsData(); String termIdMappingFilePath = env.getIndexTermIdMappingData(); String dfByTermFilePath = env.getDfByTermData(); Path inputPath = new Path(env.getTermDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { //fs.delete(weightedVectorsPath, true); sLogger.info("Output path already exists!"); return 0; }/*from w w w . 
j a va 2 s .co m*/ /* add terms file to cache */ if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath)) || !fs.exists(new Path(termIdMappingFilePath))) { throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/" + termIdMappingFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(termsFilePath), conf); DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf); DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf); /* add df table to cache */ if (!fs.exists(new Path(dfByTermFilePath))) { throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); conf.setJobName("GetWeightedTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE)); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); if (getConf().get("Ivory.ShortDocLengths") != null) { conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths")); } conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel")); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(HMapSFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); 
conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(HMapSFW.class); sLogger.info("Running job: " + conf.getJobName()); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.index.BuildIntPostingsForwardIndex.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class); FileSystem fs = FileSystem.get(conf); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); sLogger.info("Tool: BuildIntPostingsForwardIndex"); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - CollectionName: " + collectionName); conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName); Path inputPath = new Path(env.getPostingsDirectory()); FileInputFormat.setInputPaths(conf, inputPath); Path postingsIndexPath = new Path(env.getPostingsIndexData()); if (fs.exists(postingsIndexPath)) { sLogger.info("Postings forward index path already exists!"); return 0; }// w w w . j a v a 2 s . co m conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(1); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }
From source file:ivory.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
/**
 * Runs the "BuildIPInvertedIndex" job, which reads the int doc vectors directory and writes
 * the postings directory, then records the postings type in the retrieval environment.
 *
 * <p>The index directory is created if it does not exist yet.
 *
 * @return always 0, including when postings already exist and no indexing is performed
 * @throws Exception if file-system access or the job itself fails
 */
@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf jobConf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    String indexRoot = jobConf.get("Ivory.IndexPath");
    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    String collection = environment.readCollectionName();

    int numMappers = jobConf.getInt("Ivory.NumMapTasks", 0);
    int numReducers = jobConf.getInt("Ivory.NumReduceTasks", 0);
    int splitSize = jobConf.getInt("Ivory.MinSplitSize", 0);
    int docCount = environment.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexRoot);
    LOG.info(" - CollectionName: " + collection);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - NumMapTasks: " + numMappers);
    LOG.info(" - NumReduceTasks: " + numReducers);
    LOG.info(" - MinSplitSize: " + splitSize);

    // Make sure the index directory is present.
    boolean indexDirPresent = fileSystem.exists(new Path(indexRoot));
    if (!indexDirPresent) {
        fileSystem.mkdirs(new Path(indexRoot));
    }

    Path source = new Path(environment.getIntDocVectorsDirectory());
    Path postings = new Path(environment.getPostingsDirectory());

    if (fileSystem.exists(postings)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    // General job parameters.
    jobConf.setJobName("BuildIPInvertedIndex:" + collection);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setNumReduceTasks(numReducers);
    jobConf.setInt("Ivory.CollectionDocumentCount", docCount);
    jobConf.setInt("mapred.min.split.size", splitSize);
    jobConf.set("mapred.child.java.opts", "-Xmx2048m");

    // Input and output locations.
    FileInputFormat.setInputPaths(jobConf, source);
    FileOutputFormat.setOutputPath(jobConf, postings);

    // Formats and key/value types.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.setMapOutputKeyClass(PairOfInts.class);
    jobConf.setMapOutputValueClass(TermPositions.class);
    jobConf.setOutputKeyClass(IntWritable.class);
    jobConf.setOutputValueClass(PostingsListDocSortedPositional.class);

    // Processing classes.
    jobConf.setMapperClass(MyMapper.class);
    jobConf.setReducerClass(MyReducer.class);
    jobConf.setPartitionerClass(MyPartitioner.class);

    long begin = System.currentTimeMillis();
    RunningJob finished = JobClient.runJob(jobConf);
    double elapsedSeconds = (System.currentTimeMillis() - begin) / 1000.0;
    LOG.info("Job Finished in " + elapsedSeconds + " seconds");

    environment.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}
From source file:ivory.preprocess.BuildIntDocVectors.java
License:Apache License
/**
 * Runs the map-only "BuildIntDocVectors" job, which reads the term doc vectors directory and
 * writes record-compressed int doc vectors.
 *
 * <p>The terms, term-id, and term-id-mapping files are added to the distributed cache.
 *
 * @return always 0, including when the terms files are missing or the output already exists
 * @throws Exception if file-system access, URI parsing, or the job itself fails
 */
@SuppressWarnings("unused")
public int runTool() throws Exception {
    // The job configuration inherits from the configuration of this PowerTool.
    JobConf jobConf = new JobConf(getConf(), BuildIntDocVectors.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    String indexRoot = jobConf.get("Ivory.IndexPath");
    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    int numMappers = jobConf.getInt("Ivory.NumMapTasks", 0);
    String collection = environment.readCollectionName();

    sLogger.info("PowerTool: BuildIntDocVectors");
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info(" - NumMapTasks: " + numMappers);
    sLogger.info("This is new!");

    String terms = environment.getIndexTermsData();
    String termIds = environment.getIndexTermIdsData();
    String idToTerm = environment.getIndexTermIdMappingData();

    Path termsLocation = new Path(terms);
    Path termIdsLocation = new Path(termIds);

    // Only the terms and term-id files are checked for existence here.
    boolean termsPresent = fileSystem.exists(termsLocation) && fileSystem.exists(termIdsLocation);
    if (!termsPresent) {
        sLogger.error("Error, terms files don't exist!");
        return 0;
    }

    Path destination = new Path(environment.getIntDocVectorsDirectory());
    if (fileSystem.exists(destination)) {
        sLogger.info("IntDocVectors already exist: skipping!");
        return 0;
    }

    DistributedCache.addCacheFile(new URI(terms), jobConf);
    DistributedCache.addCacheFile(new URI(termIds), jobConf);
    DistributedCache.addCacheFile(new URI(idToTerm), jobConf);

    // General job parameters.
    jobConf.setJobName("BuildIntDocVectors:" + collection);
    jobConf.setNumMapTasks(numMappers);
    jobConf.setNumReduceTasks(0);

    // Input and output locations.
    FileInputFormat.setInputPaths(jobConf, environment.getTermDocVectorsDirectory());
    FileOutputFormat.setOutputPath(jobConf, destination);
    jobConf.set("mapred.child.java.opts", "-Xmx2048m");

    // Formats, compression and key/value types.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(jobConf, SequenceFile.CompressionType.RECORD);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(LazyIntDocVector.class);
    jobConf.setOutputKeyClass(IntWritable.class);
    jobConf.setOutputValueClass(LazyIntDocVector.class);

    jobConf.setMapperClass(MyMapper.class);

    long begin = System.currentTimeMillis();
    RunningJob finished = JobClient.runJob(jobConf);
    double elapsedSeconds = (System.currentTimeMillis() - begin) / 1000.0;
    sLogger.info("Job Finished in " + elapsedSeconds + " seconds");

    return 0;
}
From source file:ivory.preprocess.BuildIntDocVectorsForwardIndex.java
License:Apache License
/**
 * Runs the "BuildIntDocVectorsForwardIndex" job over either the weighted or the unweighted int
 * doc vectors, depending on the {@code Ivory.BuildWeighted} setting.
 *
 * @return always 0, including when the input vectors are missing or the forward index already
 *         exists
 * @throws Exception if file-system access or the job itself fails
 */
public int runTool() throws Exception {
    JobConf jobConf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    String indexRoot = jobConf.get("Ivory.IndexPath");
    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    int numMappers = jobConf.getInt("Ivory.NumMapTasks", 0);
    String collection = environment.readCollectionName();
    boolean weighted = jobConf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info(" - BuildWeighted: " + weighted);
    sLogger.info(" - NumMapTasks: " + numMappers);

    // Pick the weighted or unweighted locations for both the vectors and their index.
    String vectorsDir = weighted
            ? environment.getWeightedIntDocVectorsDirectory()
            : environment.getIntDocVectorsDirectory();
    String indexFile = weighted
            ? environment.getWeightedIntDocVectorsForwardIndex()
            : environment.getIntDocVectorsForwardIndex();

    boolean vectorsPresent = fileSystem.exists(new Path(vectorsDir));
    if (!vectorsPresent) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fileSystem.exists(new Path(indexFile))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    jobConf.setJobName("BuildIntDocVectorsForwardIndex:" + collection);

    FileInputFormat.setInputPaths(jobConf, new Path(vectorsDir));

    // Task counts and resource settings.
    jobConf.setNumMapTasks(numMappers);
    jobConf.setNumReduceTasks(1);
    jobConf.set("mapred.child.java.opts", "-Xmx2048m");

    // Formats and key/value types.
    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(Text.class);
    jobConf.setOutputFormat(NullOutputFormat.class);

    // Processing classes.
    jobConf.setMapRunnerClass(MyMapRunner.class);
    jobConf.setReducerClass(MyReducer.class);

    JobClient.runJob(jobConf);

    return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/* w w w . j a v a 2 s .co m*/ JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); String collectionName = conf.get("Ivory.CollectionName"); String collectionPath = conf.get("Ivory.CollectionPath"); String inputFormat = conf.get("Ivory.InputFormat"); String tokenizer = conf.get("Ivory.Tokenizer"); String mappingClass = conf.get("Ivory.DocnoMappingClass"); sLogger.info("PowerTool: BuildTermDocVectors"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - CollectionPath: " + collectionPath); sLogger.info(" - InputputFormat: " + inputFormat); sLogger.info(" - Tokenizer: " + tokenizer); sLogger.info(" - DocnoMappingClass: " + mappingClass); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + 0); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; } DistributedCache.addCacheFile(mappingFile.toUri(), conf); conf.setJobName("BuildTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); if (collectionPath.indexOf(",") == -1) { FileInputFormat.setInputPaths(conf, new Path(collectionPath)); sLogger.info("Adding input path " + collectionPath); } else { String[] paths = collectionPath.split(","); for (String p : paths) { FileInputFormat.addInputPath(conf, new Path(p)); sLogger.info("Adding input path " + p); } } Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { sLogger.info("TermDocVectors already exist: Skipping!"); } else { env.writeCollectionName(collectionName); 
env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat)); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(LazyTermDocVector.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(LazyTermDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); // write out number of postings int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter(); env.writeCollectionDocumentCount(collectionDocCount); } if (fs.exists(env.getDoclengthsData())) { sLogger.info("DocLength data exists: Skipping!"); return 0; } int collectionDocCount = env.readCollectionDocumentCount(); long startTime = System.currentTimeMillis(); writeDoclengthsData(collectionDocCount); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); String collectionName = env.readCollectionName(); sLogger.info("Tool: BuildTermDocVectorsIndex"); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - NumMapTasks: " + mapTasks); if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) { sLogger.info("Error: TermDocVectors don't exist!"); return 0; }//from w w w . j a va 2 s . c o m if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) { sLogger.info("TermDocVectorIndex already exists: skipping!"); return 0; } conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName); Path inputPath = new Path(env.getTermDocVectorsDirectory()); FileInputFormat.setInputPaths(conf, inputPath); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(1); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }