List of usage examples for org.apache.hadoop.mapreduce Job setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
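Before the per-project examples, here is a minimal, self-contained sketch of the typical pattern: a custom RawComparator (implemented here via WritableComparator) is registered with setSortComparatorClass so that map output keys reach the reducers in a custom order. The class name DescendingLongComparator, the identity Mapper/Reducer wiring, and the input/output arguments are illustrative only and are not taken from any of the projects listed below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/** Hypothetical comparator that sorts LongWritable keys in descending order. */
public class DescendingLongComparator extends WritableComparator {

    public DescendingLongComparator() {
        super(LongWritable.class, true); // true = deserialize keys so compare() receives objects
    }

    @Override
    @SuppressWarnings("rawtypes")
    public int compare(WritableComparable a, WritableComparable b) {
        // Reverse the natural ordering of LongWritable
        return -((LongWritable) a).compareTo((LongWritable) b);
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "descending key sort (sketch)");
        job.setJarByClass(DescendingLongComparator.class);

        // Identity mapper/reducer: the default TextInputFormat already supplies
        // <LongWritable offset, Text line> pairs, so they can simply be passed through.
        job.setMapperClass(Mapper.class);
        job.setReducerClass(Reducer.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // The comparator is applied during the shuffle/sort phase, before keys reach the reducer.
        job.setSortComparatorClass(DescendingLongComparator.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}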
From source file:io.apigee.lembos.mapreduce.LembosMapReduceRunner.java
License:Apache License
/**
 * Returns a properly configured, ready to run Hadoop {@link Job}.
 *
 * @param args the command line arguments as supported by {@link GenericOptionsParser}
 *
 * @return the configured job
 *
 * @throws IOException if there is a problem creating the job
 * @throws ExecutionException if there is an issue running the Node.js module
 * @throws InterruptedException if the execution of the Node.js module gets interrupted
 * @throws NodeException if there is an issue with the Node.js module
 */
public Job initJob(final String[] args)
        throws ExecutionException, InterruptedException, IOException, NodeException {
    final GenericOptionsParser gop = new GenericOptionsParser(args);

    // If run from ToolRunner, conf should already be set but if not, set it manually
    if (conf == null) {
        setConf(gop.getConfiguration());
    }

    // Load the Hadoop FS URL handler
    RunnerUtils.loadFsUrlStreamHandler(getConf());

    // Persist the non-Runner CLI arguments
    conf.setStrings(LembosConstants.MR_MODULE_ARGS, gop.getRemainingArgs());

    // Package the Node.js module and prepare it to be submitted with the Job
    RunnerUtils.prepareModuleForJob(conf);

    // Add "-libjars" to the current ClassLoader if necessary
    RunnerUtils.addLibJarsToClassLoader(conf);

    // Create Node.js environment for local use
    mrEnv = LembosMapReduceEnvironment.fromConf(conf);

    if (JavaScriptUtils.isDefined(mrEnv.getConfiguration())) {
        for (final Map.Entry<Object, Object> propertyEntry : mrEnv.getConfiguration().entrySet()) {
            final String key = propertyEntry.getKey().toString();
            final Writable value = ConversionUtils.jsToWritable(propertyEntry.getValue(), mrEnv.getModule());

            // Do not set these as we'll be setting them later from values we were passed from the CLI
            if (key.equals(LembosConstants.MR_MODULE_NAME)) {
                continue;
            }

            if (value instanceof BooleanWritable) {
                conf.setBoolean(key, ((BooleanWritable) value).get());
            } else if (value instanceof DoubleWritable || value instanceof FloatWritable) {
                conf.setFloat(key, Float.valueOf(value.toString()));
            } else if (value instanceof IntWritable) {
                conf.setInt(key, ((IntWritable) value).get());
            } else if (value instanceof LongWritable) {
                conf.setLong(key, ((LongWritable) value).get());
            } else if (value instanceof Text) {
                conf.set(key, value.toString());
            } else {
                System.err.println("Cannot convert JavaScript (" + value.getClass().getName()
                        + ") to Configuration, using String");
                conf.set(key, value.toString());
            }
        }
    }

    // Create Job
    final String jobName = "LembosMapReduceJob-" + mrEnv.getModuleName();
    final Job job = new Job(conf, jobName);

    jobWrapper = JobWrap.getInstance(mrEnv.getRuntime(), job);

    if (JavaScriptUtils.isDefined(mrEnv.getJobSetupFunction())) {
        mrEnv.callFunctionSync(mrEnv.getJobSetupFunction(), new Object[] { jobWrapper });
    }

    // Always set the mapper
    job.setMapperClass(LembosMapper.class);

    // Conditionally set the combiner
    if (JavaScriptUtils.isDefined(mrEnv.getCombineFunction())) {
        job.setCombinerClass(LembosCombiner.class);
    }

    // Conditionally set the group comparator
    if (JavaScriptUtils.isDefined(mrEnv.getGroupFunction())) {
        job.setGroupingComparatorClass(LembosGroupComparator.class);
    }

    // Conditionally set the partitioner
    if (JavaScriptUtils.isDefined(mrEnv.getPartitionFunction())) {
        job.setPartitionerClass(LembosPartitioner.class);
    }

    // Conditionally set the reducer
    if (JavaScriptUtils.isDefined(mrEnv.getReduceFunction())) {
        job.setReducerClass(LembosReducer.class);
    } else {
        job.setNumReduceTasks(0);
    }

    // Conditionally set the sort comparator
    if (JavaScriptUtils.isDefined(mrEnv.getSortFunction())) {
        job.setSortComparatorClass(LembosSortComparator.class);
    }

    // This could potentially be unsafe but for testing, we need to set this based on the path to the built JAR
    if (job.getJar() == null) {
        job.setJarByClass(LembosMapReduceRunner.class);
    }

    // MapReduce configuration reference:
    //
    // http://hadoop.apache.org/docs/stable/hadoop-mapreduce-client/hadoop-mapreduce-client-core/mapred-default.xml
    // org.apache.hadoop.mapreduce.MRConfig
    // org.apache.hadoop.mapreduce.MRJobConfig

    return job;
}
From source file:io.bfscan.clueweb12.BuildDictionary.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of terms").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(COUNT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Configuration conf = getConf();
    conf.set(HADOOP_OUTPUT_OPTION, output);
    conf.setInt(HADOOP_TERMS_COUNT_OPTION, Integer.parseInt(cmdline.getOptionValue(COUNT_OPTION)));
    conf.set("mapreduce.map.memory.mb", "4096");
    conf.set("mapreduce.map.java.opts", "-Xmx4096m");
    conf.set("mapreduce.reduce.memory.mb", "4096");
    conf.set("mapreduce.reduce.java.opts", "-Xmx4096m");

    Job job = Job.getInstance(conf);
    job.setJobName(BuildDictionary.class.getSimpleName() + ":" + input);
    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}
From source file:ipldataanalysis.IPLDataAnalysis.java
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.out.printf(
                "Three parameters are required for Data Analysis for IPL - <input dir> <intermediate dir> <output dir>\n");
        return -1;
    }

    Job job = new Job(getConf(), "Job1");
    job.setJarByClass(IPLDataAnalysis.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setMapperClass(DataAnalysisMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setReducerClass(DataAnalysisReducer.class);
    job.waitForCompletion(true);

    Job job2 = new Job(getConf(), "Job2");
    job2.setJarByClass(IPLDataAnalysis.class);
    FileInputFormat.setInputPaths(job2, new Path(args[1] + "/part-r-00000"));
    FileOutputFormat.setOutputPath(job2, new Path(args[2]));
    job2.setMapperClass(DataAnalysisMapper2.class);
    job2.setMapOutputKeyClass(LongWritable.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setSortComparatorClass(LongWritable.DecreasingComparator.class);
    job2.setReducerClass(DataAnalysisReducer3.class);
    boolean success = job2.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:ir.ac.ut.snl.mrcd.StageThree.java
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    Job job = new Job();
    String input = args[0];
    String output = args[1];
    FileInputFormat.addInputPath(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));
    job.setJarByClass(StageThree.class);
    job.setJobName("Stage three");
    job.setMapperClass(StageThreeMapper.class);
    job.setReducerClass(StageThreeReducer.class);
    // job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(DoubleWritable.class);
    job.setOutputKeyClass(DoubleWritable.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    job.setSortComparatorClass(SortDoubleComparator.class);
    job.waitForCompletion(true);

    Scanner scanner = null;
    try {
        File file = new File("/home/arian/NetBeansProjects/bscthesis2/output/stagethree/part-r-00000");
        FileReader fileReader = new FileReader(file);
        BufferedReader bufferedReader = new BufferedReader(fileReader);
        scanner = new Scanner(bufferedReader);
    } catch (Exception e) {
        System.out.println("THE FILE COULD NOT BE OPENED"); // message translated from Persian
        e.printStackTrace();
    }

    PrintWriter printWriter = new PrintWriter("/home/arian/NetBeansProjects/bscthesis2/topkedgebetweenness",
            "UTF-8");
    int k = 4;
    for (int i = 0; i < k; i++) {
        printWriter.write(scanner.nextLine());
        // if (i != k - 1)
        printWriter.write('\n');
    }
    printWriter.close();
    scanner.close();

    Path inFile = new Path("/home/arian/NetBeansProjects/bscthesis2/topkedgebetweenness");
    Path outFile = new Path("/home/arian/myhadoop/NetBeansProjects/bscthesis2/topkedgebetweenness");
    FileSystem fs = FileSystem.get(new Configuration());
    FSDataInputStream in = fs.open(inFile);
    FSDataOutputStream out = fs.create(outFile);

    int bytesRead = 0;
    byte buffer[] = new byte[256];
    while ((bytesRead = in.read(buffer)) > 0) {
        out.write(buffer, 0, bytesRead);
    }
    in.close();
    out.close();

    return 0;
}
From source file:it.crs4.seal.demux.Demux.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    Configuration conf = getConf();
    DemuxOptionParser parser = new DemuxOptionParser();
    parser.parse(conf, args);

    conf.setBoolean(CONF_NO_INDEX_READS, parser.getNoIndexReads());
    conf.setBoolean(CONF_SEPARATE_READS, parser.getSeparateReads());

    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");
    if (parser.getNoIndexReads())
        LOG.info("Not expecting to find any index reads. Will demultiplex based only on lane.");

    // load sample sheet to fail early in case of problems
    DemuxUtils.loadSampleSheet(parser.getSampleSheetPath(), conf);

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeSampleSheet(parser.getSampleSheetPath());

    // Create a Job using the processed conf
    Job job = new Job(getConf(), makeJobName(parser.getInputPaths().get(0)));
    job.setJarByClass(Demux.class);

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName("qseq")));

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(SequencedFragment.class);

    job.setPartitionerClass(SequenceIdLocationPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);
    job.setSortComparatorClass(TwoOneThreeSortComparator.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(SequencedFragment.class);

    // output
    job.setOutputFormatClass(DemuxOutputFormat.class);
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        if (parser.getCreateLaneContent())
            createLaneContentFiles(parser.getOutputPath(), parser.getSampleSheetPath());
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:ivory.core.preprocess.BuildDictionary.java
License:Apache License
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);

    LOG.info("PowerTool: " + BuildDictionary.class.getCanonicalName());
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    if (!fs.exists(new Path(indexPath))) {
        LOG.error("index path doesn't exist: skipping!");
        return 0;
    }

    if (fs.exists(new Path(env.getIndexTermsData())) && fs.exists(new Path(env.getIndexTermIdsData()))
            && fs.exists(new Path(env.getIndexTermIdMappingData()))
            && fs.exists(new Path(env.getDfByTermData())) && fs.exists(new Path(env.getCfByTermData()))
            && fs.exists(new Path(env.getDfByIntData())) && fs.exists(new Path(env.getCfByIntData()))) {
        LOG.info("term and term id data exist: skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionTermCount, (int) env.readCollectionTermCount());
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    Path tmpPath = new Path(env.getTempDirectory());
    fs.delete(tmpPath, true);

    Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + collectionName);
    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(env.getTermDfCfDirectory()));
    FileOutputFormat.setOutputPath(job, tmpPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    fs.delete(tmpPath, true);

    return 0;
}
From source file:ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java
public void run(String inputFileName) throws AssertionError, Exception {
    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));

    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
From source file:ldbc.socialnet.dbgen.generator.MRGenerateUsers.java
License:Open Source License
public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job: Generating users ----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- Job generating first-dimension friendships ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2 ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job generating friendships ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3 ----------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job generating friendships ----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 4 ----------------
    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------
    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);

    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;

    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();

                    file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();

                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*
    Job job6 = new Job(conf, "Dump the friends lists");
    job6.setMapOutputKeyClass(ComposedKey.class);
    job6.setMapOutputValueClass(ReducedUserProfile.class);
    job6.setOutputKeyClass(ComposedKey.class);
    job6.setOutputValueClass(ReducedUserProfile.class);
    job6.setJarByClass(HadoopBlockMapper.class);
    job6.setMapperClass(HadoopBlockMapper.class);
    job6.setReducerClass(FriendListOutputReducer.class);
    job6.setNumReduceTasks(numThreads);
    job6.setInputFormatClass(SequenceFileInputFormat.class);
    job6.setOutputFormatClass(SequenceFileOutputFormat.class);
    job6.setPartitionerClass(HadoopBlockPartitioner.class);
    job6.setSortComparatorClass(ComposedKeyComparator.class);
    job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6"));
    printProgress("Starting: Materialize friends for substitution parameters");
    int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);
    */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");

    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }

    return 0;
}
From source file:mvm.rya.joinselect.mr.JoinSelectAggregate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String inPath1 = conf.get(PROSPECTS_OUTPUTPATH);
    String inPath2 = conf.get(SPO_OUTPUTPATH);
    String auths = conf.get(AUTHS);
    String outPath = conf.get(OUTPUTPATH);

    assert inPath1 != null && inPath2 != null && outPath != null;

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    JoinSelectStatsUtil.initJoinMRJob(job, inPath1, inPath2, JoinSelectAggregateMapper.class, outPath, auths);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);

    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}