List of usage examples for org.apache.hadoop.mapred JobConf setPartitionerClass
public void setPartitionerClass(Class<? extends Partitioner> theClass)
From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java
License:Apache License
private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner, boolean useRange) throws IOException { JobConf conf = new JobConf(RunPageRankBasic.class); String in = path + "/iter" + sFormat.format(i); String out = path + "/iter" + sFormat.format(j) + "t"; String outm = out + "-mass"; FileSystem fs = FileSystem.get(conf); // we need to actually count the number of part files to get the number // of partitions (because the directory might contain _log) int numPartitions = 0; for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) { if (s.getPath().getName().contains("part-")) numPartitions++;//from w w w. j ava 2s. c om } conf.setInt("NodeCount", n); Partitioner p = null; if (useRange) { p = new RangePartitioner<IntWritable, Writable>(); p.configure(conf); } else { p = new HashPartitioner<WritableComparable, Writable>(); } // this is really annoying: the mapping between the partition numbers on // disk (i.e., part-XXXX) and what partition the file contains (i.e., // key.hash % #reducer) is arbitrary... so this means that we need to // open up each partition, peek inside to find out. IntWritable key = new IntWritable(); PageRankNode value = new PageRankNode(); FileStatus[] status = fs.listStatus(new Path(in)); StringBuilder sb = new StringBuilder(); for (FileStatus f : status) { if (f.getPath().getName().contains("_logs")) continue; SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf); reader.next(key, value); int np = p.getPartition(key, value, numPartitions); reader.close(); sLogger.info(f.getPath() + "\t" + np); sb.append(np + "=" + f.getPath() + "\t"); } sLogger.info(sb.toString().trim()); sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1"); sLogger.info(" - input: " + in); sLogger.info(" - output: " + out); sLogger.info(" - nodeCnt: " + n); sLogger.info(" - useCombiner: " + useCombiner); sLogger.info(" - useInmapCombiner: " + useInmapCombiner); sLogger.info(" - numPartitions: " + numPartitions); sLogger.info(" - useRange: " + useRange); sLogger.info("computed number of partitions: " + numPartitions); int numMapTasks = numPartitions; int numReduceTasks = numPartitions; conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1"); conf.setNumMapTasks(numMapTasks); conf.setNumReduceTasks(numReduceTasks); conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.set("PageRankMassPath", outm); conf.set("BasePath", in); conf.set("PartitionMapping", sb.toString().trim()); FileInputFormat.setInputPaths(conf, new Path(in)); FileOutputFormat.setOutputPath(conf, new Path(out)); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(FloatWritable.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PageRankNode.class); if (useInmapCombiner) { conf.setMapperClass(MapWithInMapperCombiningClass.class); } else { conf.setMapperClass(MapClass.class); } if (useCombiner) { conf.setCombinerClass(CombineClass.class); } if (useRange) { conf.setPartitionerClass(RangePartitioner.class); } conf.setReducerClass(ReduceClass.class); conf.setSpeculativeExecution(false); FileSystem.get(conf).delete(new Path(out), true); FileSystem.get(conf).delete(new Path(outm), true); JobClient.runJob(conf); float mass = Float.NEGATIVE_INFINITY; for (FileStatus f : fs.listStatus(new Path(outm))) { FSDataInputStream fin = fs.open(f.getPath()); mass = sumLogProbs(mass, fin.readFloat()); fin.close(); } return mass; }
From source file:edu.umd.cloud9.webgraph.CollectHostnames.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), CollectHostnames.class); FileSystem fs = FileSystem.get(conf); int numMappers = conf.getInt("Cloud9.Mappers", 1); int numReducers = conf.getInt("Cloud9.Reducers", 200); String inputPath = conf.get("Cloud9.InputPath"); String outputPath = conf.get("Cloud9.OutputPath"); conf.setJobName("CollectHostnames"); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setInt("mapred.task.timeout", 60000000); conf.setNumMapTasks(numMappers);// ww w. j av a2 s . c o m conf.setNumReduceTasks(numReducers); conf.setMapperClass(Map.class); conf.setPartitionerClass(Partition.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapOutputKeyClass(PairOfIntString.class); conf.setMapOutputValueClass(IntWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); sLogger.info("PropagateHostname"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); if (!fs.exists(new Path(outputPath))) { JobClient.runJob(conf); } else { sLogger.info(outputPath + " already exists! Skipping this step..."); } return 0; }
From source file:edu.umd.cloud9.webgraph.ComputeWeight.java
License:Apache License
public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), ComputeWeight.class); FileSystem fs = FileSystem.get(conf); int numMappers = conf.getInt("Cloud9.Mappers", 1); int numReducers = conf.getInt("Cloud9.Reducers", 200); String inputPath = conf.get("Cloud9.InputPath"); String outputPath = conf.get("Cloud9.OutputPath"); conf.setJobName("ComputeWeights"); conf.set("mapred.child.java.opts", "-Xmx4096m"); conf.setInt("mapred.task.timeout", 60000000); conf.setNumMapTasks(numMappers);//w w w . j av a 2 s. c o m conf.setNumReduceTasks(numReducers); conf.setMapperClass(Map.class); conf.setPartitionerClass(Partition.class); conf.setReducerClass(Reduce.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(ArrayListWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); LOG.info("ComputeWeight"); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); if (!fs.exists(new Path(outputPath))) { JobClient.runJob(conf); } else { LOG.info(outputPath + " already exists! Skipping this step..."); } return 0; }
From source file:edu.umd.cloud9.webgraph.driver.SortWebGraph.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length != 4) { printUsage();/*from ww w . j a v a 2 s . c o m*/ return -1; } JobConf conf = new JobConf(getConf(), SortWebGraph.class); FileSystem fs = FileSystem.get(conf); String inputPath = args[0]; String outputPath = args[1]; int numberOfDocuments = Integer.parseInt(args[2]); int numMappers = 1; int numReducers = Integer.parseInt(args[3]); conf.setJobName("SortWebGraph"); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("mapred.task.timeout", 60000000); conf.set("mapreduce.map.memory.mb", "2048"); conf.set("mapreduce.map.java.opts", "-Xmx2048m"); conf.set("mapreduce.reduce.memory.mb", "2048"); conf.set("mapreduce.reduce.java.opts", "-Xmx2048m"); conf.set("mapreduce.task.timeout", "60000000"); if (numberOfDocuments == 0) { numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS; } conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments); conf.setNumMapTasks(numMappers); conf.setNumReduceTasks(numReducers); conf.setMapperClass(IdentityMapper.class); conf.setPartitionerClass(Partition.class); conf.setReducerClass(IdentityReducer.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(ArrayListWritable.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(ArrayListWritable.class); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setCompressOutput(conf, true); SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK); SequenceFileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); LOG.info("SortAnchorText"); LOG.info(" - input path: " + inputPath); LOG.info(" - output path: " + outputPath); LOG.info(" - number of documents: " + conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS)); fs.delete(new Path(outputPath)); JobClient.runJob(conf); return 0; }
From source file:IndexService.IndexMergeMR.java
License:Open Source License
public static RunningJob run(String inputfiles, String outputdir, Configuration conf) { if (inputfiles == null || outputdir == null) return null; JobConf job = new JobConf(conf); job.setJobName("MergeIndexMR"); job.setJarByClass(IndexMergeMR.class); job.setNumReduceTasks(1);//from w w w. ja va 2s. co m FileSystem fs = null; try { fs = FileSystem.get(job); fs.delete(new Path(outputdir), true); String[] ifs = inputfiles.split(","); TreeSet<String> files = new TreeSet<String>(); for (int i = 0; i < ifs.length; i++) { IFormatDataFile ifdf = new IFormatDataFile(job); ifdf.open(ifs[i]); Collection<String> strs = ifdf.fileInfo().head().getUdi().infos().values(); for (String str : strs) { files.add(str); } ifdf.close(); } StringBuffer sb = new StringBuffer(); for (String str : files) { sb.append(str + ","); } job.set(ConstVar.HD_index_filemap, sb.substring(0, sb.length() - 1)); IFormatDataFile ifdf = new IFormatDataFile(job); ifdf.open(ifs[0]); HashMap<Integer, IRecord.IFType> map = ifdf.fileInfo().head().fieldMap().fieldtypes(); ArrayList<String> fieldStrings = new ArrayList<String>(); for (int i = 0; i < map.size(); i++) { IRecord.IFType type = map.get(i); fieldStrings.add(type.type() + ConstVar.RecordSplit + type.idx()); } job.setStrings(ConstVar.HD_fieldMap, fieldStrings.toArray(new String[fieldStrings.size()])); job.set("datafiletype", ifdf.fileInfo().head().getUdi().infos().get(123456)); ifdf.close(); } catch (Exception e2) { e2.printStackTrace(); } FileInputFormat.setInputPaths(job, inputfiles); FileOutputFormat.setOutputPath(job, new Path(outputdir)); job.setOutputKeyClass(IndexKey.class); job.setOutputValueClass(IndexValue.class); job.setPartitionerClass(IndexMergePartitioner.class); job.setMapperClass(MergeIndexMap.class); job.setCombinerClass(MergeIndexReduce.class); job.setReducerClass(MergeIndexReduce.class); job.setInputFormat(IndexMergeIFormatInputFormat.class); job.setOutputFormat(IndexMergeIFormatOutputFormat.class); try { JobClient jc = new JobClient(job); return jc.submitJob(job); } catch (IOException e) { e.printStackTrace(); return null; } }
From source file:IndexService.IndexMR.java
License:Open Source License
public static RunningJob run(Configuration conf2, String inputfiles, boolean column, String ids, String outputdir) {/*from ww w.j av a 2 s . com*/ if (inputfiles == null || outputdir == null) return null; JobConf conf = new JobConf(conf2); conf.setJobName("IndexMR:\t" + ids); conf.setJarByClass(IndexMR.class); FileSystem fs = null; try { fs = FileSystem.get(conf); fs.delete(new Path(outputdir), true); } catch (IOException e3) { e3.printStackTrace(); } conf.set("index.ids", ids); if (column) { conf.set("datafiletype", "column"); } else { conf.set("datafiletype", "format"); } String[] ifs = inputfiles.split(","); long wholerecnum = 0; String[] idxs = ids.split(","); String[] fieldStrings = new String[idxs.length + 2]; if (!column) { IFormatDataFile ifdf; try { ifdf = new IFormatDataFile(conf); ifdf.open(ifs[0]); for (int i = 0; i < idxs.length; i++) { int id = Integer.parseInt(idxs[i]); byte type = ifdf.fileInfo().head().fieldMap().fieldtypes().get(id).type(); fieldStrings[i] = type + ConstVar.RecordSplit + i; } ifdf.close(); } catch (IOException e) { e.printStackTrace(); } } else { try { IColumnDataFile icdf = new IColumnDataFile(conf); icdf.open(ifs[0]); for (int i = 0; i < idxs.length; i++) { int id = Integer.parseInt(idxs[i]); byte type = icdf.fieldtypes().get(id).type(); fieldStrings[i] = type + ConstVar.RecordSplit + i; } icdf.close(); } catch (IOException e) { e.printStackTrace(); } } fieldStrings[fieldStrings.length - 2] = ConstVar.FieldType_Short + ConstVar.RecordSplit + (fieldStrings.length - 2); fieldStrings[fieldStrings.length - 1] = ConstVar.FieldType_Int + ConstVar.RecordSplit + (fieldStrings.length - 1); conf.setStrings(ConstVar.HD_fieldMap, fieldStrings); if (!column) { conf.set(ConstVar.HD_index_filemap, inputfiles); for (String file : ifs) { IFormatDataFile fff; try { fff = new IFormatDataFile(conf); fff.open(file); wholerecnum += fff.segIndex().recnum(); fff.close(); } catch (IOException e) { e.printStackTrace(); } } } else { HashSet<String> files = new HashSet<String>(); for (String file : ifs) { files.add(file); } StringBuffer sb = new StringBuffer(); for (String str : files) { sb.append(str).append(","); } conf.set(ConstVar.HD_index_filemap, sb.substring(0, sb.length() - 1)); for (String file : files) { Path parent = new Path(file).getParent(); try { FileStatus[] fss = fs.listStatus(parent); String openfile = ""; for (FileStatus status : fss) { if (status.getPath().toString().contains(file)) { openfile = status.getPath().toString(); break; } } IFormatDataFile fff = new IFormatDataFile(conf); fff.open(openfile); wholerecnum += fff.segIndex().recnum(); fff.close(); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } } } conf.setNumReduceTasks((int) ((wholerecnum - 1) / (100000000) + 1)); FileInputFormat.setInputPaths(conf, inputfiles); Path outputPath = new Path(outputdir); FileOutputFormat.setOutputPath(conf, outputPath); conf.setOutputKeyClass(IndexKey.class); conf.setOutputValueClass(IndexValue.class); conf.setPartitionerClass(IndexPartitioner.class); conf.setMapperClass(IndexMap.class); conf.setCombinerClass(IndexReduce.class); conf.setReducerClass(IndexReduce.class); if (column) { conf.setInputFormat(IColumnInputFormat.class); } else { conf.setInputFormat(IFormatInputFormat.class); } conf.setOutputFormat(IndexIFormatOutputFormat.class); try { JobClient jc = new JobClient(conf); return jc.submitJob(conf); } catch (IOException e) { e.printStackTrace(); return null; } }
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
private static void setupPipesJob(JobConf conf) throws IOException { // default map output types to Text if (!getIsJavaMapper(conf)) { conf.setMapRunnerClass(PipesMapRunner.class); // Save the user's partitioner and hook in our's. setJavaPartitioner(conf, conf.getPartitionerClass()); conf.setPartitionerClass(PipesPartitioner.class); }/*from ww w . j a va2 s . c o m*/ if (!getIsJavaReducer(conf)) { conf.setReducerClass(PipesReducer.class); if (!getIsJavaRecordWriter(conf)) { conf.setOutputFormat(NullOutputFormat.class); } } String textClassname = Text.class.getName(); setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname); setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname); setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname); setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname); // Use PipesNonJavaInputFormat if necessary to handle progress reporting // from C++ RecordReaders ... if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) { conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class); conf.setInputFormat(PipesNonJavaInputFormat.class); } String exec = getExecutable(conf); if (exec == null) { throw new IllegalArgumentException("No application program defined."); } // add default debug script only when executable is expressed as // <path>#<executable> if (exec.contains("#")) { // set default gdb commands for map and reduce task String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script"; setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript); setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript); } URI[] fileCache = DistributedCache.getCacheFiles(conf); if (fileCache == null) { fileCache = new URI[1]; } else { URI[] tmp = new URI[fileCache.length + 1]; System.arraycopy(fileCache, 0, tmp, 1, fileCache.length); fileCache = tmp; } try { fileCache[0] = new URI(exec); } catch (URISyntaxException e) { IOException ie = new IOException("Problem parsing execable URI " + exec); ie.initCause(e); throw ie; } DistributedCache.setCacheFiles(fileCache, conf); }
From source file:it.crs4.pydoop.pipes.Submitter.java
License:Apache License
@Override public int run(String[] args) throws Exception { CommandLineParser cli = new CommandLineParser(); if (args.length == 0) { cli.printUsage();//from w ww. ja v a2s.com return 1; } cli.addOption("input", false, "input path to the maps", "path"); cli.addOption("output", false, "output path from the reduces", "path"); cli.addOption("jar", false, "job jar file", "path"); cli.addOption("inputformat", false, "java classname of InputFormat", "class"); //cli.addArgument("javareader", false, "is the RecordReader in Java"); cli.addOption("map", false, "java classname of Mapper", "class"); cli.addOption("partitioner", false, "java classname of Partitioner", "class"); cli.addOption("reduce", false, "java classname of Reducer", "class"); cli.addOption("writer", false, "java classname of OutputFormat", "class"); cli.addOption("program", false, "URI to application executable", "class"); cli.addOption("reduces", false, "number of reduces", "num"); cli.addOption("jobconf", false, "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val"); cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean"); Parser parser = cli.createParser(); try { GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args); CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs()); JobConf job = new JobConf(getConf()); if (results.hasOption("input")) { FileInputFormat.setInputPaths(job, results.getOptionValue("input")); } if (results.hasOption("output")) { FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output"))); } if (results.hasOption("jar")) { job.setJar(results.getOptionValue("jar")); } if (results.hasOption("inputformat")) { setIsJavaRecordReader(job, true); job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class)); } if (results.hasOption("javareader")) { setIsJavaRecordReader(job, true); } if (results.hasOption("map")) { setIsJavaMapper(job, true); job.setMapperClass(getClass(results, "map", job, Mapper.class)); } if (results.hasOption("partitioner")) { job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class)); } if (results.hasOption("reduce")) { setIsJavaReducer(job, true); job.setReducerClass(getClass(results, "reduce", job, Reducer.class)); } if (results.hasOption("reduces")) { job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces"))); } if (results.hasOption("writer")) { setIsJavaRecordWriter(job, true); job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class)); } if (results.hasOption("lazyOutput")) { if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) { LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass()); } } if (results.hasOption("program")) { setExecutable(job, results.getOptionValue("program")); } if (results.hasOption("jobconf")) { LOG.warn("-jobconf option is deprecated, please use -D instead."); String options = results.getOptionValue("jobconf"); StringTokenizer tokenizer = new StringTokenizer(options, ","); while (tokenizer.hasMoreTokens()) { String keyVal = tokenizer.nextToken().trim(); String[] keyValSplit = keyVal.split("="); job.set(keyValSplit[0], keyValSplit[1]); } } // if they gave us a jar file, include it into the class path String jarFile = job.getJar(); if (jarFile != null) { final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() }; //FindBugs complains that creating a URLClassLoader should be //in a doPrivileged() block. ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() { public ClassLoader run() { return new URLClassLoader(urls); } }); job.setClassLoader(loader); } runJob(job); return 0; } catch (ParseException pe) { LOG.info("Error : " + pe); cli.printUsage(); return 1; } }
From source file:ivory.index.BuildIPInvertedIndexDocSorted.java
License:Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); int collectionDocCnt = env.readCollectionDocumentCount(); LOG.info("PowerTool: BuildIPInvertedIndexDocSorted"); LOG.info(" - IndexPath: " + indexPath); LOG.info(" - CollectionName: " + collectionName); LOG.info(" - CollectionDocumentCount: " + collectionDocCnt); LOG.info(" - NumMapTasks: " + mapTasks); LOG.info(" - NumReduceTasks: " + reduceTasks); LOG.info(" - MinSplitSize: " + minSplitSize); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from w w w .j a va2s . co m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setJobName("BuildIPInvertedIndex:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, postingsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(TermPositions.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PostingsListDocSortedPositional.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType("ivory.data.PostingsListDocSortedPositional"); return 0; }
From source file:mapreduce.BigramCount.java
License:Apache License
/** * Runs this tool./*from w w w.j a v a 2s. co m*/ */ public int run(String[] args) throws Exception { if (args.length != 2) { printUsage(); return -1; } String inputPath = args[0]; String outputPath = args[1]; int mapTasks = 1;//Integer.parseInt(args[2]); int reduceTasks = 1;//Integer.parseInt(args[3]); sLogger.info("Tool: BigramCount"); sLogger.info(" - input path: " + inputPath); sLogger.info(" - output path: " + outputPath); sLogger.info(" - number of mappers: " + mapTasks); sLogger.info(" - number of reducers: " + reduceTasks); JobConf conf = new JobConf(BigramCount.class); conf.setJobName("BigramCount"); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); FileInputFormat.setInputPaths(conf, new Path(inputPath)); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); FileOutputFormat.setCompressOutput(conf, false); /** * Note that these must match the Class arguments given in the mapper */ conf.setOutputKeyClass(WordPair.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(MyMapper.class); conf.setPartitionerClass(MyPartitioner.class); conf.setCombinerClass(MyReducer.class); conf.setReducerClass(MyReducer.class); // Delete the output directory if it exists already Path outputDir = new Path(outputPath); FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true); long startTime = System.currentTimeMillis(); JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }