Usage examples for org.apache.hadoop.mapreduce.Job.setPartitionerClass, drawn from open-source projects.
public void setPartitionerClass(Class<? extends Partitioner> cls) throws IllegalStateException
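Job.setPartitionerClass registers the Partitioner that decides which reduce task receives each intermediate key/value pair; it throws IllegalStateException if the job has already been submitted. The examples below all pass project-specific partitioner classes whose sources are not reproduced on this page, so here is a minimal sketch of the contract every one of them fulfills. The KeyHashPartitioner name and the Text/IntWritable types are illustrative assumptions, not taken from any example; the logic mirrors Hadoop's default HashPartitioner.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Routes each key to a reducer by key hash. A custom partitioner only has
// to map (key, value, numPartitions) to an int in [0, numPartitions).
public class KeyHashPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask off the sign bit so the result is never negative.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

Wiring is the same one-liner used throughout the examples, and it must happen before job submission: job.setPartitionerClass(KeyHashPartitioner.class);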
From source file: it.polito.dbdmg.searum.ARM.java
License: Apache License
/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
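The example above pairs setPartitionerClass with setSortComparatorClass and setGroupingComparatorClass, which is the standard secondary-sort pattern: the partitioner and the grouping comparator look only at the "natural" part of a composite key, while the sort comparator orders by the full key. The sources of RulePartitionerByConclusion and the two comparators are not shown on this page, so the sketch below is a hedged reconstruction of how such a trio typically divides the work, assuming a composite Text key of the form "conclusion\tpremise"; all class names here are hypothetical.

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Partitioner;

// Assumed composite key layout: "conclusion\tpremise" in a single Text.
final class SecondarySortSketch {

    static String conclusionOf(Text key) {
        String s = key.toString();
        int tab = s.indexOf('\t');
        return tab < 0 ? s : s.substring(0, tab);
    }

    // Partition on the conclusion only, so every rule sharing a conclusion
    // reaches the same reducer regardless of its premise.
    public static class ConclusionPartitioner extends Partitioner<Text, Text> {
        @Override
        public int getPartition(Text key, Text value, int numPartitions) {
            return (conclusionOf(key).hashCode() & Integer.MAX_VALUE) % numPartitions;
        }
    }

    // Sort by the full composite key: conclusions first, premises within.
    public static class CompositeKeySortComparator extends WritableComparator {
        public CompositeKeySortComparator() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return ((Text) a).compareTo((Text) b);
        }
    }

    // Group reducer input by conclusion, so a single reduce() call receives
    // all (already premise-sorted) rules for one conclusion.
    public static class ConclusionGroupingComparator extends WritableComparator {
        public ConclusionGroupingComparator() {
            super(Text.class, true);
        }

        @Override
        public int compare(WritableComparable a, WritableComparable b) {
            return conclusionOf((Text) a).compareTo(conclusionOf((Text) b));
        }
    }
}

The three classes would be wired exactly as in the example: via setPartitionerClass, setSortComparatorClass, and setGroupingComparatorClass respectively.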
From source file: ivory.core.index.BuildIPInvertedIndexDocSorted.java
License: Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);
    int minSplitSize = conf.getInt(Constants.MinSplitSize, 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    String postingsType = conf.get(Constants.PostingsListsType,
            ivory.core.data.index.PostingsListDocSortedPositional.class.getCanonicalName());
    @SuppressWarnings("unchecked")
    Class<? extends PostingsList> postingsClass = (Class<? extends PostingsList>) Class.forName(postingsType);

    LOG.info("PowerTool: " + BuildIPInvertedIndexDocSorted.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionDocumentCount, collectionDocCnt));
    LOG.info(String.format(" - %s: %s", Constants.PostingsListsType, postingsClass.getCanonicalName()));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));
    LOG.info(String.format(" - %s: %s", Constants.MinSplitSize, minSplitSize));

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCnt);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildIPInvertedIndexDocSorted.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildIPInvertedIndexDocSorted.class);
    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, postingsPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(postingsClass);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType(postingsClass.getCanonicalName());

    return 0;
}
From source file: kogiri.mapreduce.libra.kmersimilarity_r.KmerSimilarityReduce.java
License: Open Source License
private int runJob(LibraConfig lConfig) throws Exception {
    // check config
    validateLibraConfig(lConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Kogiri Libra - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    lConfig.getClusterConfiguration().configureTo(conf);
    lConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, lConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
    LOG.info("MapReduce nodes detected : " + MRNodes);

    FileOutputFormat.setOutputPath(job, new Path(lConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducersPerNode = lConfig.getClusterConfiguration().getMachineCores() / 2;
    if (reducersPerNode < 1) {
        reducersPerNode = 1;
    }
    int reducers = lConfig.getClusterConfiguration().getMachineNum()
            * (lConfig.getClusterConfiguration().getMachineCores() / 2);
    LOG.info("Reducers : " + reducers);
    job.setNumReduceTasks(reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(lConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(lConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(lConfig.getOutputPath()), conf);
    }

    // report
    if (lConfig.getReportPath() != null && !lConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(lConfig.getReportPath());
    }

    return result ? 0 : 1;
}
From source file: kogiri.mapreduce.preprocess.indexing.stage2.KmerIndexBuilder.java
License: Open Source License
private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Kogiri Preprocessor - Building Kmer Indices (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        job.setInputFormatClass(FastaReadInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(CompressedIntArrayWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);
        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducersPerNode = ppConfig.getClusterConfiguration().getMachineCores() / 2;
        if (reducersPerNode < 1) {
            reducersPerNode = 1;
        }
        int reducers = ppConfig.getClusterConfiguration().getMachineNum()
                * (ppConfig.getClusterConfiguration().getMachineCores() / 2);
        LOG.info("Reducers : " + reducers);
        job.setNumReduceTasks(reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);
        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
From source file: kogiri.mapreduce.readfrequency.modecount.ModeCounter.java
License: Open Source License
private int runJob(ReadFrequencyCounterConfig rfConfig) throws Exception {
    // check config
    validateReadFrequencyCounterConfig(rfConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    rfConfig.getClusterConfiguration().configureTo(conf);
    rfConfig.saveTo(conf);

    // table file
    Path tableFilePath = new Path(rfConfig.getKmerMatchPath(), KmerMatchHelper.makeKmerMatchTableFileName());
    FileSystem fs = tableFilePath.getFileSystem(conf);
    KmerMatchFileMapping fileMapping = KmerMatchFileMapping.createInstance(fs, tableFilePath);

    Path[] inputFiles = KmerMatchHelper.getAllKmerMatchResultFilePath(conf, rfConfig.getKmerMatchPath());

    // Register named outputs
    NamedOutputs namedOutputs = new NamedOutputs();
    for (int i = 0; i < fileMapping.getSize(); i++) {
        String fastaFileName = fileMapping.getFastaFileFromID(i);
        namedOutputs.add(fastaFileName);
    }
    namedOutputs.saveTo(conf);

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < fileMapping.getSize(); round++) {
        String roundOutputPath = rfConfig.getReadFrequencyPath() + "_round" + round;

        Job job = new Job(conf, "Kogiri Preprocessor - Computing Mode of Kmer Frequency (" + round + " of "
                + fileMapping.getSize() + ")");
        job.setJarByClass(ModeCounter.class);

        // Mapper
        job.setMapperClass(ModeCounterMapper.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(MultiFileIntWritable.class);
        job.setMapOutputValueClass(CompressedIntArrayWritable.class);

        // Combiner
        job.setCombinerClass(ModeCounterCombiner.class);

        // Partitioner
        job.setPartitionerClass(ModeCounterPartitioner.class);

        // Reducer
        job.setReducerClass(ModeCounterReducer.class);

        // Specify key / value
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Inputs
        FileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(inputFiles));

        ModeCounterConfig modeCounterConfig = new ModeCounterConfig();
        modeCounterConfig.setMasterFileID(round);
        modeCounterConfig.saveTo(job.getConfiguration());

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(TextOutputFormat.class);

        for (NamedOutputRecord namedOutput : namedOutputs.getRecord()) {
            MultipleOutputs.addNamedOutput(job, namedOutput.getIdentifier(), TextOutputFormat.class,
                    Text.class, Text.class);
        }

        // Execute job and return status
        boolean result = job.waitForCompletion(true);
        jobs.add(job);

        // commit results
        if (result) {
            commitRoundOutputFiles(new Path(roundOutputPath), new Path(rfConfig.getReadFrequencyPath()),
                    job.getConfiguration(), namedOutputs, round);
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + fileMapping.getSize());
            job_result = false;
            break;
        }
    }

    // report
    if (rfConfig.getReportPath() != null && !rfConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(rfConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
From source file: layer.AutoCoder.java
License: Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of reducers")
            .create(NUM_REDUCERS));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath0 = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);
    int reduceTasks = cmdline.hasOption(NUM_REDUCERS) ? Integer.parseInt(cmdline.getOptionValue(NUM_REDUCERS))
            : 1;

    LOG.info("Tool: " + AutoCoder.class.getSimpleName());
    LOG.info(" - input path: " + inputPath0);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of reducers: " + reduceTasks);

    Configuration conf = getConf();
    initialParameters(conf);

    for (int iterations = 1; iterations < GlobalUtil.NUM_LAYER + 1; iterations++) {
        LOG.info("** Layer: " + iterations);
        try {
            Job job = Job.getInstance(conf);
            job.setJobName(AutoCoder.class.getSimpleName());
            job.setJarByClass(AutoCoder.class);

            // set the path of the information of k clusters in this iteration
            job.getConfiguration().set("sidepath", inputPath0 + "/side_output");
            job.getConfiguration().setInt("layer_ind", iterations);
            job.setNumReduceTasks(reduceTasks);

            String inputPath = inputPath0 + "/train";
            dataShuffle();

            FileInputFormat.setInputPaths(job, new Path(inputPath));
            FileOutputFormat.setOutputPath(job, new Path(outputPath));

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(ModelNode.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(ModelNode.class);

            job.setMapperClass(MyMapper.class);
            job.setReducerClass(MyReducer.class);
            job.setPartitionerClass(MyPartitioner.class);

            // Delete the output directory if it exists already.
            Path outputDir = new Path(outputPath);
            FileSystem.get(getConf()).delete(outputDir, true);

            long startTime = System.currentTimeMillis();
            job.waitForCompletion(true);
            LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

            prepareNextIteration(inputPath0, outputPath, iterations, conf, reduceTasks);
        } catch (Exception exp) {
            exp.printStackTrace();
        }
    }

    return 0;
}
From source file: ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java
public void run(String inputFileName) throws AssertionError, Exception {
    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));

    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));

    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
From source file: ldbc.socialnet.dbgen.generator.MRGenerateUsers.java
License: Open Source License
public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job Generating users----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- job Generating First dimension Friendships ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2 ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job Generating Friendships ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3 ----------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job Generating Friendships----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 4 ----------------
    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------
    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);

    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;

    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    file = fs.open(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_"
                            + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir") + "/social_network/updateStream_" + i + "_" + j
                            + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*
    Job job6 = new Job(conf, "Dump the friends lists");
    job6.setMapOutputKeyClass(ComposedKey.class);
    job6.setMapOutputValueClass(ReducedUserProfile.class);
    job6.setOutputKeyClass(ComposedKey.class);
    job6.setOutputValueClass(ReducedUserProfile.class);
    job6.setJarByClass(HadoopBlockMapper.class);
    job6.setMapperClass(HadoopBlockMapper.class);
    job6.setReducerClass(FriendListOutputReducer.class);
    job6.setNumReduceTasks(numThreads);
    job6.setInputFormatClass(SequenceFileInputFormat.class);
    job6.setOutputFormatClass(SequenceFileOutputFormat.class);
    job6.setPartitionerClass(HadoopBlockPartitioner.class);
    job6.setSortComparatorClass(ComposedKeyComparator.class);
    job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6"));
    printProgress("Starting: Materialize friends for substitution parameters");
    int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);
    */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");
    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }
    return 0;
}
From source file: libra.core.kmersimilarity_r.KmerSimilarityReduce.java
License: Apache License
private int runJob(CoreConfig cConfig) throws Exception {
    // check config
    validateCoreConfig(cConfig);

    // configuration
    Configuration conf = this.getConf();

    Job job = new Job(conf, "Libra Core - Computing similarity between samples");
    conf = job.getConfiguration();

    // set user configuration
    cConfig.saveTo(conf);

    job.setJarByClass(KmerSimilarityReduce.class);

    // Mapper
    job.setMapperClass(KmerSimilarityMapper.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapOutputKeyClass(CompressedSequenceWritable.class);
    job.setMapOutputValueClass(CompressedIntArrayWritable.class);

    // Partitioner
    job.setPartitionerClass(KmerSimilarityPartitioner.class);

    // Reducer
    job.setReducerClass(KmerSimilarityReducer.class);

    // Specify key / value
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Inputs
    Path[] kmerIndexFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, cConfig.getKmerIndexPath());
    List<Path> indexPartFileArray = new ArrayList<Path>();
    for (Path kmerIndexFile : kmerIndexFiles) {
        Path[] inputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, kmerIndexFile);
        for (Path indexPartFile : inputKmerIndexPartFiles) {
            Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                    indexPartFile);
            for (Path kmerIndexPartDataFile : kmerIndexPartDataFiles) {
                indexPartFileArray.add(kmerIndexPartDataFile);
            }
        }
    }

    SequenceFileInputFormat.addInputPaths(job,
            FileSystemHelper.makeCommaSeparated(indexPartFileArray.toArray(new Path[0])));

    LOG.info("Input kmer index files : " + kmerIndexFiles.length);
    for (Path inputFile : kmerIndexFiles) {
        LOG.info("> " + inputFile.toString());
    }

    int kmerSize = 0;
    for (Path inputFile : kmerIndexFiles) {
        // check kmerSize
        int myKmerSize = KmerIndexHelper.getKmerSize(inputFile);
        if (kmerSize == 0) {
            kmerSize = myKmerSize;
        } else {
            if (kmerSize != myKmerSize) {
                throw new Exception("kmer size must be the same over all given kmer indices");
            }
        }
    }

    KmerMatchFileMapping fileMapping = new KmerMatchFileMapping();
    for (Path kmerIndexFile : kmerIndexFiles) {
        String fastaFilename = KmerIndexHelper.getFastaFileName(kmerIndexFile.getName());
        fileMapping.addFastaFile(fastaFilename);
    }
    fileMapping.saveTo(conf);

    FileOutputFormat.setOutputPath(job, new Path(cConfig.getOutputPath()));
    job.setOutputFormatClass(TextOutputFormat.class);

    // Reducer
    // Use many reducers
    int reducers = conf.getInt("mapred.reduce.tasks", 0);
    if (reducers <= 0) {
        int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
        reducers = MRNodes * 2;
        job.setNumReduceTasks(reducers);
    }
    LOG.info("Reducers : " + reducers);

    // Execute job and return status
    boolean result = job.waitForCompletion(true);

    // commit results
    if (result) {
        commit(new Path(cConfig.getOutputPath()), conf);

        Path tableFilePath = new Path(cConfig.getOutputPath(),
                KmerSimilarityHelper.makeKmerSimilarityTableFileName());
        FileSystem fs = tableFilePath.getFileSystem(conf);
        fileMapping.saveTo(fs, tableFilePath);

        // combine results
        sumScores(new Path(cConfig.getOutputPath()), conf);
    }

    // report
    if (cConfig.getReportPath() != null && !cConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(job);
        report.writeTo(cConfig.getReportPath());
    }

    return result ? 0 : 1;
}
From source file: libra.preprocess.stage2.KmerIndexBuilder.java
License: Apache License
private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Libra Preprocessor - Building Kmer Indexes (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        FastaKmerInputFormat.setKmerSize(conf, ppConfig.getKmerSize());
        job.setInputFormatClass(FastaKmerInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);
        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducers = conf.getInt("mapred.reduce.tasks", 0);
        if (reducers <= 0) {
            int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
            reducers = MRNodes * 2;
            job.setNumReduceTasks(reducers);
        }
        LOG.info("Reducers : " + reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);
        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());

            // create statistics of index
            createStatisticsOfIndex(new Path(ppConfig.getKmerStatisticsPath()), roundInputFile,
                    job.getConfiguration(), job.getCounters(), ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}
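A detail worth noting in both KmerIndexBuilder examples above: the partitioner is configured through the job Configuration (via KmerIndexBuilderPartitioner.setHistogramPath) rather than through a constructor, because Hadoop instantiates partitioners reflectively. ReflectionUtils.newInstance calls setConf on any instance implementing Configurable, so that is the standard channel for handing a side file such as a histogram to a partitioner. The sketch below shows the pattern in miniature; the RangePartitioner class, the configuration key, and the boundary-file format are illustrative assumptions, not the actual KmerIndexBuilderPartitioner implementation.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Partitions keys into contiguous ranges whose upper bounds are read from a
// side file named in the job configuration.
public class RangePartitioner extends Partitioner<Text, IntWritable> implements Configurable {

    // Hypothetical configuration key; the real examples hide theirs behind setHistogramPath().
    private static final String BOUNDARY_PATH_KEY = "example.range.partitioner.path";

    private Configuration conf;
    private Text[] boundaries; // sorted upper bounds, one per partition cut

    // Driver-side helper, mirroring KmerIndexBuilderPartitioner.setHistogramPath above.
    public static void setBoundaryPath(Configuration conf, Path path) {
        conf.set(BOUNDARY_PATH_KEY, path.toString());
    }

    @Override
    public void setConf(Configuration conf) {
        // Called by ReflectionUtils.newInstance when the framework creates the partitioner.
        this.conf = conf;
        try {
            this.boundaries = loadBoundaries(conf, new Path(conf.get(BOUNDARY_PATH_KEY)));
        } catch (IOException e) {
            throw new IllegalArgumentException("cannot load partition boundaries", e);
        }
    }

    @Override
    public Configuration getConf() {
        return conf;
    }

    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Binary search for the first boundary >= key; that index is the partition.
        int lo = 0;
        int hi = boundaries.length;
        while (lo < hi) {
            int mid = (lo + hi) >>> 1;
            if (boundaries[mid].compareTo(key) < 0) {
                lo = mid + 1;
            } else {
                hi = mid;
            }
        }
        return Math.min(lo, numPartitions - 1);
    }

    private static Text[] loadBoundaries(Configuration conf, Path path) throws IOException {
        // One boundary string per line in the side file.
        FileSystem fs = path.getFileSystem(conf);
        List<Text> result = new ArrayList<Text>();
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(fs.open(path), StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                result.add(new Text(line));
            }
        }
        return result.toArray(new Text[0]);
    }
}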