List of usage examples for org.apache.hadoop.mapred.JobConf.setOutputFormat
public void setOutputFormat(Class<? extends OutputFormat> theClass)
From source file:ivory.preprocess.BuildIntDocVectorsForwardIndex.java
License:Apache License
/**
 * Configures and runs the job that builds the forward index over the int doc vectors.
 *
 * <p>Reads {@code Ivory.IndexPath}, {@code Ivory.NumMapTasks} and {@code Ivory.BuildWeighted}
 * from the tool configuration. When {@code Ivory.BuildWeighted} is true the weighted int doc
 * vectors and their forward index location are used; otherwise the unweighted ones are used.
 * The job is not run when the input directory is missing or the forward index already exists.
 *
 * @return always 0, whether the job ran or was skipped
 * @throws Exception propagated from the file system, retrieval environment or job client calls
 */
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    // Pick the weighted or unweighted input directory and forward index location.
    String intDocVectorsPath;
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        // Missing input is an error condition, so report it at error level.
        sLogger.error("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    // No output path is configured for this job; the output format is NullOutputFormat.
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool// w w w . j a v a 2 s. c o m JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); String collectionName = conf.get("Ivory.CollectionName"); String collectionPath = conf.get("Ivory.CollectionPath"); String inputFormat = conf.get("Ivory.InputFormat"); String tokenizer = conf.get("Ivory.Tokenizer"); String mappingClass = conf.get("Ivory.DocnoMappingClass"); sLogger.info("PowerTool: BuildTermDocVectors"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - CollectionPath: " + collectionPath); sLogger.info(" - InputputFormat: " + inputFormat); sLogger.info(" - Tokenizer: " + tokenizer); sLogger.info(" - DocnoMappingClass: " + mappingClass); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + 0); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; } DistributedCache.addCacheFile(mappingFile.toUri(), conf); conf.setJobName("BuildTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); if (collectionPath.indexOf(",") == -1) { FileInputFormat.setInputPaths(conf, new Path(collectionPath)); sLogger.info("Adding input path " + collectionPath); } else { String[] paths = collectionPath.split(","); for (String p : paths) { FileInputFormat.addInputPath(conf, new Path(p)); sLogger.info("Adding input path " + p); } } Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { sLogger.info("TermDocVectors already exist: Skipping!"); } else { env.writeCollectionName(collectionName); 
env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat)); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(LazyTermDocVector.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(LazyTermDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); // write out number of postings int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter(); env.writeCollectionDocumentCount(collectionDocCount); } if (fs.exists(env.getDoclengthsData())) { sLogger.info("DocLength data exists: Skipping!"); return 0; } int collectionDocCount = env.readCollectionDocumentCount(); long startTime = System.currentTimeMillis(); writeDoclengthsData(collectionDocCount); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.preprocess.BuildTermDocVectors.java
License:Apache License
/**
 * Runs a single-map, zero-reduce job that writes the doc length data file, then records the
 * docno offset and the average document length in the retrieval environment.
 *
 * <p>The job is given the document count, the doc lengths directory and the target data file
 * through the configuration keys {@code Ivory.CollectionDocumentCount}, {@code InputPath} and
 * {@code DocLengthDataFile}. The average length is the {@code DocLengths.SumOfDocLengths}
 * counter divided by {@code collectionDocCount}.
 *
 * @param collectionDocCount number of documents in the collection; used as the divisor for the
 *     average document length (a value of 0 yields a non-finite average)
 * @throws IOException propagated from the file system, retrieval environment or job client calls
 */
private void writeDoclengthsData(int collectionDocCount) throws IOException {
    // NOTE(review): the JobConf is anchored on GetTermCount.class although the source header
    // attributes this method to BuildTermDocVectors — confirm this is intentional.
    JobConf conf = new JobConf(getConf(), GetTermCount.class);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");
    int docnoOffset = conf.getInt("Ivory.DocnoOffset", 0);

    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path dlFile = env.getDoclengthsData();
    Path inputPath = env.getDoclengthsDirectory();

    sLogger.info("Writing doc length data to " + dlFile + "...");

    conf.setJobName("DocLengthTable:" + collectionName);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCount);
    conf.set("InputPath", inputPath.toString());
    conf.set("DocLengthDataFile", dlFile.toString());
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(NullInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(DocLengthDataWriterMapper.class);

    RunningJob job = JobClient.runJob(conf);

    env.writeDocnoOffset(docnoOffset);

    Counters counters = job.getCounters();
    long collectionSumOfDocLengths = (long) counters.findCounter(DocLengths.SumOfDocLengths).getCounter();

    // Divide in double and narrow once: casting the long sum to float first would discard
    // low-order bits for sums above 2^24 before the division even happens.
    float averageDocLength = (float) ((double) collectionSumOfDocLengths / collectionDocCount);
    env.writeCollectionAverageDocumentLength(averageDocLength);
}
From source file:ivory.preprocess.BuildTermDocVectorsForwardIndex.java
License:Apache License
/**
 * Configures and runs the job that builds the forward index over the term doc vectors.
 *
 * <p>Reads {@code Ivory.IndexPath} and {@code Ivory.NumMapTasks} from the tool configuration.
 * The job is not run when the term doc vectors directory is missing or when the forward index
 * already exists.
 *
 * @return always 0, whether the job ran or was skipped
 * @throws Exception propagated from the file system, retrieval environment or job client calls
 */
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        // Missing input is an error condition, so report it at error level.
        sLogger.error("Error: TermDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    // No output path is configured for this job; the output format is NullOutputFormat.
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);
    return 0;
}
From source file:ivory.preprocess.BuildTermIdMap.java
License:Apache License
/**
 * Configures and runs the single-reducer job that builds the term / term id data.
 *
 * <p>Reads {@code Ivory.IndexPath}, {@code Ivory.CollectionName}, {@code Ivory.NumMapTasks}
 * and {@code Ivory.MinSplitSize} from the tool configuration. The job is skipped when the
 * index path does not exist or when any of the term, term id, id-to-term, df or cf data
 * files already exists. Job output goes to the environment's temp directory, which is deleted
 * both before the job is configured and after it completes.
 *
 * @return always 0, whether the job ran or was skipped
 * @throws Exception propagated from the file system, retrieval environment or job client calls
 */
public int runTool() throws Exception {
    // Create a new JobConf, inheriting from the configuration of this PowerTool.
    JobConf conf = new JobConf(getConf(), BuildTermIdMap.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = 1;
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);

    sLogger.info("PowerTool: BuildTermIdMap");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    if (!fs.exists(new Path(indexPath))) {
        sLogger.error("index path doesn't exist: skipping!");
        return 0;
    }

    Path termsFilePath = new Path(env.getIndexTermsData());
    Path termIDsFilePath = new Path(env.getIndexTermIdsData());
    Path idToTermFilePath = new Path(env.getIndexTermIdMappingData());
    Path dfByTermFilePath = new Path(env.getDfByTermData());
    Path cfByTermFilePath = new Path(env.getCfByTermData());
    Path dfByIntFilePath = new Path(env.getDfByIntData());
    Path cfByIntFilePath = new Path(env.getCfByIntData());

    // Any one of the output files being present is enough to skip the whole job.
    if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath)
            || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath)
            || fs.exists(dfByIntFilePath) || fs.exists(cfByIntFilePath)) {
        sLogger.info("term and term id data exist: skipping!");
        return 0;
    }

    // Start from a clean temp directory; it is the job's output path.
    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    conf.setJobName("BuildTermIdMap:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount());
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(conf, tmpPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    // The RunningJob handle is not needed here, so it is not kept.
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}
From source file:ivory.preprocess.GetTermCount.java
License:Apache License
public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/*from w w w. j a v a 2 s.c o m*/ JobConf conf = new JobConf(getConf(), GetTermCount.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); int mapTasks = conf.getInt(Constants.NumMapTasks, 0); int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0); String collectionName = env.readCollectionName(); String termDocVectorsPath = env.getTermDocVectorsDirectory(); String termDfCfPath = env.getTermDfCfDirectory(); if (!fs.exists(new Path(indexPath))) { sLogger.info("index path doesn't existing: skipping!"); return 0; } sLogger.info("PowerTool: GetTermCount"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + reduceTasks); sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0)); sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE)); Path outputPath = new Path(termDfCfPath); if (fs.exists(outputPath)) { sLogger.error("TermDfCf directory exist: skipping!"); return 0; } conf.setJobName("GetTermCount:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath)); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(PairOfIntLong.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PairOfIntLong.class); conf.setMapperClass(MyMapper.class); conf.setCombinerClass(MyCombiner.class); conf.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); 
sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); // write out number of postings int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter(); env.writeCollectionTermCount(collectionTermCount); // NOTE: this value is not the same as number of postings, because // postings for non-English terms are discarded, or as result of df cut long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter(); env.writeCollectionLength(collectionLength); return 0; }
From source file:ivory.ptc.AnchorTextInvertedIndex.java
License:Apache License
@Override public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class); FileSystem fs = FileSystem.get(conf); String inPath = conf.get("Ivory.InputPath"); String outPath = conf.get("Ivory.OutputPath"); Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = conf.getInt("Ivory.NumMapTasks", 1); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100); String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters"); LOG.info("BuildAnchorTextInvertedIndex"); LOG.info(" - input path: " + inPath); LOG.info(" - output path: " + outPath); LOG.info(" - number of reducers: " + reduceTasks); LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme")); LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters); String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER); for (String param : params) { DistributedCache.addCacheFile(new URI(param), conf); }//from w w w. ja v a 2s . c o m conf.setJobName("BuildAnchorTextInvertedIndex"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setInt("mapred.task.timeout", 60000000); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(AnchorTextTarget.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); fs.delete(outputPath); JobClient.runJob(conf); return 0; }
From source file:ivory.ptc.driver.XMLFormatJudgments.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { printUsage();/*w ww . j a v a 2 s . com*/ return -1; } JobConf conf = new JobConf(getConf(), XMLFormatJudgments.class); // Command line arguments String inPath = args[0]; String outPath = args[1]; String docnoMapping = args[2]; Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = 1; int reduceTasks = 1; conf.setJobName("FormatPseudoJudgments"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); DistributedCache.addCacheFile(new URI(docnoMapping), conf); FileSystem.get(conf).delete(outputPath); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PseudoQuery.class); conf.setMapOutputValueClass(PseudoJudgments.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }
From source file:ivory.ptc.driver.XMLFormatQueries.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { printUsage();/*from w ww .j a v a 2 s . c o m*/ return -1; } JobConf conf = new JobConf(getConf(), XMLFormatQueries.class); // Command line arguments String inPath = args[0]; String outPath = args[1]; Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = 1; int reduceTasks = 1; conf.setJobName("FormatPseudoQueries"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileSystem.get(conf).delete(outputPath); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PseudoQuery.class); conf.setMapOutputValueClass(PseudoJudgments.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }
From source file:ivory.ptc.SortedPseudoTestCollection.java
License:Apache License
/**
 * Configures and runs the single-map, single-reduce job named "SortedPTC".
 *
 * <p>Reads {@code Ivory.InputPath} and {@code Ivory.OutputPath} from the tool configuration
 * and logs the judgment extractor, sampling criterion and query scorer settings. Any existing
 * output path is deleted before the job is run. Input and output are both sequence files
 * keyed by {@code PseudoQuery} with {@code PseudoJudgments} values.
 *
 * @return always 0
 * @throws Exception propagated from the file system or the job client calls
 */
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), SortedPseudoTestCollection.class);
    FileSystem fs = FileSystem.get(conf);

    String inPath = conf.get("Ivory.InputPath");
    String outPath = conf.get("Ivory.OutputPath");
    Path inputPath = new Path(inPath);
    Path outputPath = new Path(outPath);

    int mapTasks = 1;
    int reduceTasks = 1;

    LOG.info("SortedPseudoTestCollection");
    // Reuse the values already read above instead of querying the configuration again.
    LOG.info(" - Input path: " + inPath);
    LOG.info(" - Output path: " + outPath);
    LOG.info(" - JudgmentExtractor: " + conf.get("Ivory.JudgmentExtractor"));
    LOG.info(" - JudgmentExtractorParameters: " + conf.get("Ivory.JudgmentExtractorParameters"));
    LOG.info(" - SamplingCriterion: " + conf.get("Ivory.SamplingCriterion"));
    LOG.info(" - SamplingCriterionParameters: " + conf.get("Ivory.SamplingCriterionParameters"));
    LOG.info(" - QueryScorer: " + conf.get("Ivory.QueryScorer"));

    conf.setJobName("SortedPTC");
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PseudoQuery.class);
    conf.setMapOutputValueClass(PseudoJudgments.class);
    conf.setOutputKeyClass(PseudoQuery.class);
    conf.setOutputValueClass(PseudoJudgments.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Clear previous output; the explicit recursive flag replaces the deprecated
    // single-argument delete(Path) overload.
    fs.delete(outputPath, true);

    JobClient.runJob(conf);
    return 0;
}