List of usage examples for org.apache.hadoop.mapred JobConf setOutputKeyClass
public void setOutputKeyClass(Class<?> theClass)
From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.info("PowerTool: GetWeightedTermDocVectors"); JobConf conf = new JobConf(BuildWeightedTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedTermDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); String termsFilePath = env.getIndexTermsData(); String termsIdsFilePath = env.getIndexTermIdsData(); String termIdMappingFilePath = env.getIndexTermIdMappingData(); String dfByTermFilePath = env.getDfByTermData(); Path inputPath = new Path(env.getTermDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { //fs.delete(weightedVectorsPath, true); sLogger.info("Output path already exists!"); return 0; }/* w w w . j av a 2 s. 
com*/ /* add terms file to cache */ if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath)) || !fs.exists(new Path(termIdMappingFilePath))) { throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/" + termIdMappingFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(termsFilePath), conf); DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf); DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf); /* add df table to cache */ if (!fs.exists(new Path(dfByTermFilePath))) { throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); conf.setJobName("GetWeightedTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE)); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); if (getConf().get("Ivory.ShortDocLengths") != null) { conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths")); } conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel")); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(HMapSFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); 
conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(HMapSFW.class); sLogger.info("Running job: " + conf.getJobName()); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.index.BuildIntPostingsForwardIndex.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class); FileSystem fs = FileSystem.get(conf); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); sLogger.info("Tool: BuildIntPostingsForwardIndex"); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - CollectionName: " + collectionName); conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName); Path inputPath = new Path(env.getPostingsDirectory()); FileInputFormat.setInputPaths(conf, inputPath); Path postingsIndexPath = new Path(env.getPostingsIndexData()); if (fs.exists(postingsIndexPath)) { sLogger.info("Postings forward index path already exists!"); return 0; }//from ww w . ja v a2 s. c o m conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(1); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(Text.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setOutputFormat(NullOutputFormat.class); conf.setMapRunnerClass(MyMapRunner.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }
From source file:ivory.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); int collectionDocCnt = env.readCollectionDocumentCount(); LOG.info("PowerTool: BuildIPInvertedIndexDocSorted"); LOG.info(" - IndexPath: " + indexPath); LOG.info(" - CollectionName: " + collectionName); LOG.info(" - CollectionDocumentCount: " + collectionDocCnt); LOG.info(" - NumMapTasks: " + mapTasks); LOG.info(" - NumReduceTasks: " + reduceTasks); LOG.info(" - MinSplitSize: " + minSplitSize); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from w ww.jav a 2s. 
c o m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setJobName("BuildIPInvertedIndex:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, postingsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(TermPositions.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PostingsListDocSortedPositional.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType("ivory.data.PostingsListDocSortedPositional"); return 0; }
From source file:ivory.preprocess.BuildIntDocVectors.java
License:Apache License
/**
 * Configures and runs the map-only "BuildIntDocVectors" job, which reads the term doc vectors
 * directory and writes the int doc vectors directory of the index.
 *
 * <p>The terms, term-ids and id-to-term mapping files are all shipped through the distributed
 * cache, so all three are checked for existence up front. The job is skipped when any of them is
 * missing or when the output directory already exists.
 *
 * @return 0, whether the job ran or was skipped
 * @throws Exception on any failure propagated from the file system, URI parsing or the job client
 */
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this PowerTool
    JobConf conf = new JobConf(getConf(), BuildIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("PowerTool: BuildIntDocVectors");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info("This is new!");

    String termsFile = env.getIndexTermsData();
    String termIDsFile = env.getIndexTermIdsData();
    String idToTermFile = env.getIndexTermIdMappingData();

    Path termsFilePath = new Path(termsFile);
    Path termIDsFilePath = new Path(termIDsFile);
    Path idToTermFilePath = new Path(idToTermFile);

    // Every file registered in the distributed cache below must exist; the
    // id-to-term mapping file is verified together with the other two.
    if (!fs.exists(termsFilePath) || !fs.exists(termIDsFilePath)
            || !fs.exists(idToTermFilePath)) {
        sLogger.error("Error, terms files don't exist!");
        return 0;
    }

    Path outputPath = new Path(env.getIntDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        sLogger.info("IntDocVectors already exist: skipping!");
        return 0;
    }

    DistributedCache.addCacheFile(new URI(termsFile), conf);
    DistributedCache.addCacheFile(new URI(termIDsFile), conf);
    DistributedCache.addCacheFile(new URI(idToTermFile), conf);

    conf.setJobName("BuildIntDocVectors:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0); // map-only job

    FileInputFormat.setInputPaths(conf, env.getTermDocVectorsDirectory());
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(LazyIntDocVector.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(LazyIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0
            + " seconds");

    return 0;
}
From source file:ivory.preprocess.BuildTermDocVectors.java
License:Apache License
@SuppressWarnings("unchecked") public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/*from ww w . ja v a 2s.c om*/ JobConf conf = new JobConf(getConf(), BuildTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); String collectionName = conf.get("Ivory.CollectionName"); String collectionPath = conf.get("Ivory.CollectionPath"); String inputFormat = conf.get("Ivory.InputFormat"); String tokenizer = conf.get("Ivory.Tokenizer"); String mappingClass = conf.get("Ivory.DocnoMappingClass"); sLogger.info("PowerTool: BuildTermDocVectors"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - CollectionPath: " + collectionPath); sLogger.info(" - InputputFormat: " + inputFormat); sLogger.info(" - Tokenizer: " + tokenizer); sLogger.info(" - DocnoMappingClass: " + mappingClass); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + 0); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); Path mappingFile = env.getDocnoMappingData(); if (!fs.exists(mappingFile)) { sLogger.error("Error, docno mapping data file " + mappingFile + "doesn't exist!"); return 0; } DistributedCache.addCacheFile(mappingFile.toUri(), conf); conf.setJobName("BuildTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); if (collectionPath.indexOf(",") == -1) { FileInputFormat.setInputPaths(conf, new Path(collectionPath)); sLogger.info("Adding input path " + collectionPath); } else { String[] paths = collectionPath.split(","); for (String p : paths) { FileInputFormat.addInputPath(conf, new Path(p)); sLogger.info("Adding input path " + p); } } Path outputPath = new Path(env.getTermDocVectorsDirectory()); if (fs.exists(outputPath)) { sLogger.info("TermDocVectors already exist: Skipping!"); } else { env.writeCollectionName(collectionName); 
env.writeCollectionPath(collectionPath); env.writeInputFormat(inputFormat); env.writeDocnoMappingClass(mappingClass); env.writeTokenizerClass(tokenizer); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat((Class<? extends InputFormat>) Class.forName(inputFormat)); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.RECORD); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(LazyTermDocVector.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(LazyTermDocVector.class); conf.setMapperClass(MyMapper.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); // write out number of postings int collectionDocCount = (int) counters.findCounter(Docs.Total).getCounter(); env.writeCollectionDocumentCount(collectionDocCount); } if (fs.exists(env.getDoclengthsData())) { sLogger.info("DocLength data exists: Skipping!"); return 0; } int collectionDocCount = env.readCollectionDocumentCount(); long startTime = System.currentTimeMillis(); writeDoclengthsData(collectionDocCount); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:ivory.preprocess.BuildTermIdMap.java
License:Apache License
/**
 * Sets up and runs the single-reducer "BuildTermIdMap" job over the term df/cf directory.
 *
 * <p>The job is skipped when the index path is missing or when any of the term, term-id,
 * id-to-term, df or cf data files already exists. The job's declared output directory is the
 * environment's temp directory, which is deleted both before and after the run.
 *
 * @return 0, whether the job ran or was skipped
 * @throws Exception on any failure propagated from the file system or the job client
 */
@SuppressWarnings("unused")
public int runTool() throws Exception {
    // Job configuration derived from this PowerTool's own configuration.
    JobConf jobConf = new JobConf(getConf(), BuildTermIdMap.class);
    FileSystem fileSystem = FileSystem.get(jobConf);

    String indexRoot = jobConf.get("Ivory.IndexPath");
    String collection = jobConf.get("Ivory.CollectionName");

    int numMappers = jobConf.getInt("Ivory.NumMapTasks", 0);
    int numReducers = 1;
    int splitSizeFloor = jobConf.getInt("Ivory.MinSplitSize", 0);

    sLogger.info("PowerTool: BuildTermIdMap");
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - NumMapTasks: " + numMappers);
    sLogger.info(" - NumReduceTasks: " + numReducers);

    RetrievalEnvironment environment = new RetrievalEnvironment(indexRoot, fileSystem);

    if (!fileSystem.exists(new Path(indexRoot))) {
        sLogger.error("index path doesn't existing: skipping!");
        return 0;
    }

    // All data files this tool is responsible for, in the order they are probed.
    Path[] producedFiles = {
        new Path(environment.getIndexTermsData()),
        new Path(environment.getIndexTermIdsData()),
        new Path(environment.getIndexTermIdMappingData()),
        new Path(environment.getDfByTermData()),
        new Path(environment.getCfByTermData()),
        new Path(environment.getDfByIntData()),
        new Path(environment.getCfByIntData()),
    };

    // A single pre-existing file is enough to skip the whole job.
    for (Path produced : producedFiles) {
        if (fileSystem.exists(produced)) {
            sLogger.info("term and term id data exist: skipping!");
            return 0;
        }
    }

    // Start from a clean temp directory.
    Path scratchDir = new Path(environment.getTempDirectory());
    fileSystem.delete(scratchDir, true);

    jobConf.setJobName("BuildTermIdMap:" + collection);

    jobConf.setNumMapTasks(numMappers);
    jobConf.setNumReduceTasks(numReducers);

    jobConf.setInt("Ivory.CollectionTermCount", (int) environment.readCollectionTermCount());
    jobConf.setInt("mapred.min.split.size", splitSizeFloor);
    jobConf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(jobConf, new Path(environment.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(jobConf, scratchDir);

    jobConf.setInputFormat(SequenceFileInputFormat.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(PairOfIntLong.class);
    jobConf.setOutputKeyClass(Text.class);

    jobConf.setMapperClass(IdentityMapper.class);
    jobConf.setReducerClass(MyReducer.class);

    long startedAt = System.currentTimeMillis();
    RunningJob finished = JobClient.runJob(jobConf);
    double elapsedSeconds = (System.currentTimeMillis() - startedAt) / 1000.0;
    sLogger.info("Job Finished in " + elapsedSeconds + " seconds");

    // The declared job output is only scratch space; remove it again.
    // NOTE(review): presumably MyReducer writes the real data files itself - confirm.
    fileSystem.delete(scratchDir, true);

    return 0;
}
From source file:ivory.preprocess.GetTermCount.java
License:Apache License
public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/*from ww w . j ava 2s .co m*/ JobConf conf = new JobConf(getConf(), GetTermCount.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get(Constants.IndexPath); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); int mapTasks = conf.getInt(Constants.NumMapTasks, 0); int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0); String collectionName = env.readCollectionName(); String termDocVectorsPath = env.getTermDocVectorsDirectory(); String termDfCfPath = env.getTermDfCfDirectory(); if (!fs.exists(new Path(indexPath))) { sLogger.info("index path doesn't existing: skipping!"); return 0; } sLogger.info("PowerTool: GetTermCount"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + reduceTasks); sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0)); sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE)); Path outputPath = new Path(termDfCfPath); if (fs.exists(outputPath)) { sLogger.error("TermDfCf directory exist: skipping!"); return 0; } conf.setJobName("GetTermCount:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath)); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(PairOfIntLong.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(PairOfIntLong.class); conf.setMapperClass(MyMapper.class); conf.setCombinerClass(MyCombiner.class); conf.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job 
Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); Counters counters = job.getCounters(); // write out number of postings int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter(); env.writeCollectionTermCount(collectionTermCount); // NOTE: this value is not the same as number of postings, because // postings for non-English terms are discarded, or as result of df cut long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter(); env.writeCollectionLength(collectionLength); return 0; }
From source file:ivory.ptc.AnchorTextInvertedIndex.java
License:Apache License
@Override public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), AnchorTextInvertedIndex.class); FileSystem fs = FileSystem.get(conf); String inPath = conf.get("Ivory.InputPath"); String outPath = conf.get("Ivory.OutputPath"); Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = conf.getInt("Ivory.NumMapTasks", 1); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 100); String weightingSchemeParameters = conf.get("Ivory.WeightingSchemeParameters"); LOG.info("BuildAnchorTextInvertedIndex"); LOG.info(" - input path: " + inPath); LOG.info(" - output path: " + outPath); LOG.info(" - number of reducers: " + reduceTasks); LOG.info(" - weighting scheme: " + conf.get("Ivory.WeightingScheme")); LOG.info(" - weighting scheme parameters: " + weightingSchemeParameters); String[] params = weightingSchemeParameters.split(PARAMETER_SEPARATER); for (String param : params) { DistributedCache.addCacheFile(new URI(param), conf); }// ww w .ja v a2s . c o m conf.setJobName("BuildAnchorTextInvertedIndex"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setInt("mapred.task.timeout", 60000000); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(AnchorTextTarget.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); fs.delete(outputPath); JobClient.runJob(conf); return 0; }
From source file:ivory.ptc.driver.XMLFormatJudgments.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 3) { printUsage();//w ww .ja v a 2s . c o m return -1; } JobConf conf = new JobConf(getConf(), XMLFormatJudgments.class); // Command line arguments String inPath = args[0]; String outPath = args[1]; String docnoMapping = args[2]; Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = 1; int reduceTasks = 1; conf.setJobName("FormatPseudoJudgments"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); DistributedCache.addCacheFile(new URI(docnoMapping), conf); FileSystem.get(conf).delete(outputPath); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PseudoQuery.class); conf.setMapOutputValueClass(PseudoJudgments.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }
From source file:ivory.ptc.driver.XMLFormatQueries.java
License:Apache License
@Override public int run(String[] args) throws Exception { if (args.length != 2) { printUsage();/*from ww w . j av a 2s. co m*/ return -1; } JobConf conf = new JobConf(getConf(), XMLFormatQueries.class); // Command line arguments String inPath = args[0]; String outPath = args[1]; Path inputPath = new Path(inPath); Path outputPath = new Path(outPath); int mapTasks = 1; int reduceTasks = 1; conf.setJobName("FormatPseudoQueries"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileSystem.get(conf).delete(outputPath); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(PseudoQuery.class); conf.setMapOutputValueClass(PseudoJudgments.class); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); JobClient.runJob(conf); return 0; }