List of usage examples for org.apache.hadoop.mapreduce Job setGroupingComparatorClass
public void setGroupingComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
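Before the project-specific examples, a minimal sketch of the secondary-sort pattern that setGroupingComparatorClass supports. The CompositeKey, NaturalKeyPartitioner, and CompositeKeyComparator names below are hypothetical placeholders, not taken from any of the listed projects; the grouping comparator compares only the natural-key part of the composite key, so all values sharing that key reach a single reduce() call even though the full key also carries a secondary-sort field.

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical grouping comparator: group reduce input by the natural key only,
// ignoring the secondary-sort field of CompositeKey.
public class NaturalKeyGroupingComparator extends WritableComparator {

    public NaturalKeyGroupingComparator() {
        super(CompositeKey.class, true); // create key instances for deserialization
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompositeKey left = (CompositeKey) a;
        CompositeKey right = (CompositeKey) b;
        return left.getNaturalKey().compareTo(right.getNaturalKey());
    }
}

// Wiring, in the same style as the examples below:
Job job = Job.getInstance(conf, "secondary sort sketch");
job.setMapOutputKeyClass(CompositeKey.class);
job.setPartitionerClass(NaturalKeyPartitioner.class);       // route records by natural key
job.setSortComparatorClass(CompositeKeyComparator.class);    // sort by natural key, then secondary field
job.setGroupingComparatorClass(NaturalKeyGroupingComparator.class);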
From source file:it.crs4.seal.prq.PairReadsQSeq.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    // defaults
    conf.set(PrqOptionParser.INPUT_FORMAT_CONF, PrqOptionParser.InputFormatDefault);

    // parse command line
    PrqOptionParser parser = new PrqOptionParser();
    parser.parse(conf, args);

    Job job = new Job(conf, "PairReadsQSeq " + parser.getInputPaths().get(0));
    job.setJarByClass(PairReadsQSeq.class);
    job.setInputFormatClass(FormatNameMap.getInputFormat(parser.getInputFormatName()));
    job.setOutputFormatClass(FormatNameMap.getOutputFormat(parser.getOutputFormatName("prq")));

    job.setMapperClass(PrqMapper.class);
    job.setMapOutputKeyClass(SequenceId.class);
    job.setMapOutputValueClass(Text.class);

    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(GroupByLocationComparator.class);

    job.setReducerClass(PrqReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(ReadPair.class);

    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    return (job.waitForCompletion(true) ? 0 : 1);
}
From source file:it.polito.dbdmg.searum.ARM.java
License:Apache License
/**
 * Run the rule aggregator job over mined rules.
 *
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static void startRuleAggregating(Parameters params, Configuration conf)
        throws IOException, ClassNotFoundException, InterruptedException {
    conf.set("mapred.compress.map.output", "true");
    conf.set("mapred.output.compression.type", "BLOCK");

    Path input = new Path(params.get(OUTPUT), RULES);
    Job job = new Job(conf, "Rule aggregator driver running over input: " + input);
    job.setJarByClass(ARM.class);

    FileInputFormat.addInputPath(job, input);
    Path outPath = new Path(params.get(OUTPUT), RULESBYCONCLUSION);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(RuleAggregatorMapper.class);
    job.setReducerClass(RuleAggregatorReducer.class);
    job.setPartitionerClass(RulePartitionerByConclusion.class);
    job.setSortComparatorClass(RulesWritableComparator.class);
    job.setGroupingComparatorClass(RulesGroupingWritableComparator.class);

    HadoopUtil.delete(conf, outPath);

    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:ldbc.snb.datagen.hadoop.HadoopPersonActivityGenerator.java
public void run(String inputFileName) throws AssertionError, Exception {
    FileSystem fs = FileSystem.get(conf);

    System.out.println("RANKING");
    String rankedFileName = conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/ranked";
    HadoopFileRanker hadoopFileRanker = new HadoopFileRanker(conf, TupleKey.class, Person.class, null);
    hadoopFileRanker.run(inputFileName, rankedFileName);

    System.out.println("GENERATING");
    int numThreads = Integer.parseInt(conf.get("ldbc.snb.datagen.generator.numThreads"));
    Job job = Job.getInstance(conf, "Person Activity Generator/Serializer");
    job.setMapOutputKeyClass(BlockKey.class);
    job.setMapOutputValueClass(Person.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Person.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(HadoopPersonActivityGeneratorReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setSortComparatorClass(BlockKeyComparator.class);
    job.setGroupingComparatorClass(BlockKeyGroupComparator.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);

    /** PROFILING OPTIONS **/
    //job.setProfileEnabled(true);
    //job.setProfileParams("-agentlib:hprof=cpu=samples,heap=sites,depth=4,thread=y,format=b,file=%s");
    //job.setProfileTaskRange(true,"0-1");
    //job.setProfileTaskRange(false,"0-1");
    /****/

    FileInputFormat.setInputPaths(job, new Path(rankedFileName));
    FileOutputFormat.setOutputPath(job, new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"));

    long start = System.currentTimeMillis();
    try {
        if (!job.waitForCompletion(true)) {
            throw new Exception();
        }
    } catch (AssertionError e) {
        throw e;
    }
    System.out.println("Real time to generate activity: " + (System.currentTimeMillis() - start) / 1000.0f);

    try {
        fs.delete(new Path(rankedFileName), true);
        fs.delete(new Path(conf.get("ldbc.snb.datagen.serializer.hadoopDir") + "/aux"), true);
    } catch (IOException e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
From source file:ldbc.socialnet.dbgen.generator.MRGenerateUsers.java
License:Open Source License
public int runGenerateJob(Configuration conf) throws Exception {
    FileSystem fs = FileSystem.get(conf);
    String hadoopDir = new String(conf.get("outputDir") + "/hadoop");
    String socialNetDir = new String(conf.get("outputDir") + "/social_network");
    int numThreads = Integer.parseInt(conf.get("numThreads"));
    System.out.println("NUMBER OF THREADS " + numThreads);

    /// --------- Execute Jobs ------
    long start = System.currentTimeMillis();

    /// --------------- First job Generating users----------------
    printProgress("Starting: Person generation");
    conf.set("pass", Integer.toString(0));
    Job job = new Job(conf, "SIB Generate Users & 1st Dimension");
    job.setMapOutputKeyClass(TupleKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(GenerateUsersMapper.class);
    job.setMapperClass(GenerateUsersMapper.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(NLineInputFormat.class);
    conf.setInt("mapred.line.input.format.linespermap", 1);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir) + "/mrInputFile");
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib"));
    job.waitForCompletion(true);

    /// --------------- Sorting by first dimension ----------------
    printProgress("Starting: Sorting by first dimension");
    HadoopFileRanker fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib", hadoopDir + "/sibSorting");
    fs.delete(new Path(hadoopDir + "/sib"), true);

    /// --------------- job Generating First dimension Friendships ----------------
    printProgress("Starting: Friendship generation 1.");
    conf.set("pass", Integer.toString(0));
    conf.set("dimension", Integer.toString(1));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib2"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting"), true);

    /// --------------- Sorting phase 2 ----------------
    printProgress("Starting: Sorting by second dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib2", hadoopDir + "/sibSorting2");
    fs.delete(new Path(hadoopDir + "/sib2"), true);

    /// --------------- Second job Generating Friendships ----------------
    printProgress("Starting: Friendship generation 2.");
    conf.set("pass", Integer.toString(1));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Interest");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting2"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib3"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting2"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib3", hadoopDir + "/sibSorting3");
    fs.delete(new Path(hadoopDir + "/sib3"), true);

    /// --------------- Third job Generating Friendships----------------
    printProgress("Starting: Friendship generation 3.");
    conf.set("pass", Integer.toString(2));
    conf.set("dimension", Integer.toString(2));
    job = new Job(conf, "SIB Generate Friendship - Random");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(DimensionReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting3"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib4"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sibSorting3"), true);

    /// --------------- Sorting phase 3--------------
    printProgress("Starting: Sorting by third dimension (for activity generation)");
    fileRanker = new HadoopFileRanker(conf, TupleKey.class, ReducedUserProfile.class);
    fileRanker.run(hadoopDir + "/sib4", hadoopDir + "/sibSorting4");
    fs.delete(new Path(hadoopDir + "/sib4"), true);

    /// --------------- Fourth job: Serialize static network ----------------
    printProgress("Starting: Generating person activity");
    job = new Job(conf, "Generate user activity");
    job.setMapOutputKeyClass(ComposedKey.class);
    job.setMapOutputValueClass(ReducedUserProfile.class);
    job.setOutputKeyClass(TupleKey.class);
    job.setOutputValueClass(ReducedUserProfile.class);
    job.setJarByClass(HadoopBlockMapper.class);
    job.setMapperClass(HadoopBlockMapper.class);
    job.setReducerClass(UserActivityReducer.class);
    job.setNumReduceTasks(numThreads);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setPartitionerClass(HadoopBlockPartitioner.class);
    job.setSortComparatorClass(ComposedKeyComparator.class);
    job.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job, new Path(hadoopDir + "/sib5"));
    job.waitForCompletion(true);
    fs.delete(new Path(hadoopDir + "/sib5"), true);
    int numEvents = 0;
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    if (conf.getBoolean("updateStreams", false)) {
        for (int i = 0; i < numThreads; ++i) {
            int numPartitions = conf.getInt("numUpdatePartitions", 1);
            for (int j = 0; j < numPartitions; ++j) {
                /// --------------- Fifth job: Sort update streams ----------------
                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "forum");
                Job jobForum = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobForum.setMapOutputKeyClass(LongWritable.class);
                jobForum.setMapOutputValueClass(Text.class);
                jobForum.setOutputKeyClass(LongWritable.class);
                jobForum.setOutputValueClass(Text.class);
                jobForum.setJarByClass(UpdateEventMapper.class);
                jobForum.setMapperClass(UpdateEventMapper.class);
                jobForum.setReducerClass(UpdateEventReducer.class);
                jobForum.setNumReduceTasks(1);
                jobForum.setInputFormatClass(SequenceFileInputFormat.class);
                jobForum.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobForum.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobForum,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"));
                FileOutputFormat.setOutputPath(jobForum, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobForum.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_forum"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                conf.setInt("mapred.line.input.format.linespermap", 1000000);
                conf.setInt("reducerId", i);
                conf.setInt("partitionId", j);
                conf.set("streamType", "person");
                Job jobPerson = new Job(conf, "Sorting update streams " + j + " of reducer " + i);
                jobPerson.setMapOutputKeyClass(LongWritable.class);
                jobPerson.setMapOutputValueClass(Text.class);
                jobPerson.setOutputKeyClass(LongWritable.class);
                jobPerson.setOutputValueClass(Text.class);
                jobPerson.setJarByClass(UpdateEventMapper.class);
                jobPerson.setMapperClass(UpdateEventMapper.class);
                jobPerson.setReducerClass(UpdateEventReducer.class);
                jobPerson.setNumReduceTasks(1);
                jobPerson.setInputFormatClass(SequenceFileInputFormat.class);
                jobPerson.setOutputFormatClass(SequenceFileOutputFormat.class);
                jobPerson.setPartitionerClass(UpdateEventPartitioner.class);
                FileInputFormat.addInputPath(jobPerson,
                        new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"));
                FileOutputFormat.setOutputPath(jobPerson, new Path(hadoopDir + "/sibEnd"));
                printProgress("Starting: Sorting update streams");
                jobPerson.waitForCompletion(true);
                fs.delete(new Path(socialNetDir + "/temp_updateStream_" + i + "_" + j + "_person"), false);
                fs.delete(new Path(hadoopDir + "/sibEnd"), true);

                if (conf.getBoolean("updateStreams", false)) {
                    Properties properties = new Properties();
                    FSDataInputStream file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    file = fs.open(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"));
                    properties.load(file);
                    if (properties.getProperty("min_write_event_start_time") != null) {
                        Long auxMin = Long.parseLong(properties.getProperty("min_write_event_start_time"));
                        min = auxMin < min ? auxMin : min;
                        Long auxMax = Long.parseLong(properties.getProperty("max_write_event_start_time"));
                        max = auxMax > max ? auxMax : max;
                        numEvents += Long.parseLong(properties.getProperty("num_events"));
                    }
                    file.close();
                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_person.properties"), true);
                    fs.delete(new Path(conf.get("outputDir")
                            + "/social_network/updateStream_" + i + "_" + j + "_forum.properties"), true);
                }
            }
        }

        if (conf.getBoolean("updateStreams", false)) {
            OutputStream output = fs
                    .create(new Path(conf.get("outputDir") + "/social_network/updateStream.properties"));
            output.write(new String("ldbc.snb.interactive.gct_delta_duration:" + conf.get("deltaTime") + "\n")
                    .getBytes());
            output.write(
                    new String("ldbc.snb.interactive.min_write_event_start_time:" + min + "\n").getBytes());
            output.write(
                    new String("ldbc.snb.interactive.max_write_event_start_time:" + max + "\n").getBytes());
            output.write(new String("ldbc.snb.interactive.update_interleave:" + (max - min) / numEvents + "\n")
                    .getBytes());
            output.write(new String("ldbc.snb.interactive.num_events:" + numEvents).getBytes());
            output.close();
        }
    }

    /// --------------- Sixth job: Materialize the friends lists ----------------
    /*
    Job job6 = new Job(conf,"Dump the friends lists");
    job6.setMapOutputKeyClass(ComposedKey.class);
    job6.setMapOutputValueClass(ReducedUserProfile.class);
    job6.setOutputKeyClass(ComposedKey.class);
    job6.setOutputValueClass(ReducedUserProfile.class);
    job6.setJarByClass(HadoopBlockMapper.class);
    job6.setMapperClass(HadoopBlockMapper.class);
    job6.setReducerClass(FriendListOutputReducer.class);
    job6.setNumReduceTasks(numThreads);
    job6.setInputFormatClass(SequenceFileInputFormat.class);
    job6.setOutputFormatClass(SequenceFileOutputFormat.class);
    job6.setPartitionerClass(HadoopBlockPartitioner.class);
    job6.setSortComparatorClass(ComposedKeyComparator.class);
    job6.setGroupingComparatorClass(ComposedKeyGroupComparator.class);
    FileInputFormat.setInputPaths(job6, new Path(hadoopDir + "/sibSorting4"));
    FileOutputFormat.setOutputPath(job6, new Path(hadoopDir + "/job6") );
    printProgress("Starting: Materialize friends for substitution parameters");
    int resMaterializeFriends = job6.waitForCompletion(true) ? 0 : 1;
    fs.delete(new Path(hadoopDir + "/sibSorting3"),true);
    */

    long end = System.currentTimeMillis();
    System.out.println(((end - start) / 1000) + " total seconds");
    for (int i = 0; i < numThreads; ++i) {
        fs.copyToLocalFile(new Path(socialNetDir + "/m" + i + "factors.txt"), new Path("./"));
        fs.copyToLocalFile(new Path(socialNetDir + "/m0friendList" + i + ".csv"), new Path("./"));
    }
    return 0;
}
From source file:license.LicenseDriver.java
public static void main(String[] args) throws Exception {
    if (args.length != 3) {
        System.out.println("usage: [students dataset path] [grades dataset path] [output]");
        System.exit(-1);
    }
    Configuration configuration = new Configuration();
    configuration.setClass(ILicenseNameParsingStrategy.class.getName(),
            LicenseNameWritableParsingStrategy.class, IParsingStrategy.class);
    configuration.setClass(ILicenseTypeParsingStrategy.class.getName(),
            LicenseTypeWritableParsingStrategy.class, IParsingStrategy.class);

    Job job = Job.getInstance(configuration);
    job.setOutputKeyClass(LicenseKey.class);
    job.setOutputValueClass(JoinNameAndLicense.class);

    MultipleInputs.addInputPath(job, new Path(args[0]), NamesWritableInputFormat.class,
            NamesDetailsMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[1]), LicensesWritableInputFormat.class,
            LicensesDetailsMapper.class);

    job.setReducerClass(LicenseReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setPartitionerClass(LicenseKeyPartitioner.class);
    job.setGroupingComparatorClass(LicenseGroupingComparator.class);

    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.setJarByClass(LicenseDriver.class);
    job.submit();
}
From source file:mvm.rya.joinselect.mr.JoinSelectAggregate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    String inPath1 = conf.get(PROSPECTS_OUTPUTPATH);
    String inPath2 = conf.get(SPO_OUTPUTPATH);
    String auths = conf.get(AUTHS);
    String outPath = conf.get(OUTPUTPATH);

    assert inPath1 != null && inPath2 != null && outPath != null;

    Job job = new Job(conf, this.getClass().getSimpleName() + "_" + System.currentTimeMillis());
    job.setJarByClass(this.getClass());
    conf.setBoolean(MRJobConfig.MAPREDUCE_JOB_USER_CLASSPATH_FIRST, true);

    JoinSelectStatsUtil.initJoinMRJob(job, inPath1, inPath2, JoinSelectAggregateMapper.class, outPath, auths);

    job.setSortComparatorClass(JoinSelectSortComparator.class);
    job.setGroupingComparatorClass(JoinSelectGroupComparator.class);
    job.setPartitionerClass(JoinSelectPartitioner.class);
    job.setReducerClass(JoinReducer.class);
    job.setNumReduceTasks(32);
    job.waitForCompletion(true);

    return job.isSuccessful() ? 0 : 1;
}
From source file:name.abhijitsarkar.hadoop.join.ReduceSideJoinDriver.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    Job job = new Job(conf, "reduce-side-join");
    job.setJarByClass(getClass());

    job.setPartitionerClass(KeyPartitioner.class);
    job.setGroupingComparatorClass(KeyGroupingComparator.class);

    job.setReducerClass(ReduceSideJoinReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    MultipleInputs.addInputPath(job, new Path(args[0], "customers.txt"), TextInputFormat.class,
            CustomerMapper.class);
    MultipleInputs.addInputPath(job, new Path(args[0], "orders.txt"), TextInputFormat.class,
            OrderMapper.class);
    job.setMapOutputKeyClass(TaggedKey.class);
    job.setMapOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file:nl.gridline.zieook.inx.movielens.RowSimilarityZieOok.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numberOfColumns", "r", "Number of columns in the input matrix");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerRow", "m",
            "Number of maximum similarities per row (default: " + DEFAULT_MAX_SIMILARITIES_PER_ROW + ')',
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ROW));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    int numberOfColumns = Integer.parseInt(parsedArgs.get("--numberOfColumns"));
    String similarityClassnameArg = parsedArgs.get("--similarityClassname");
    String distributedSimilarityClassname;
    try {
        distributedSimilarityClassname = SimilarityType.valueOf(similarityClassnameArg)
                .getSimilarityImplementationClassName();
    } catch (IllegalArgumentException iae) {
        distributedSimilarityClassname = similarityClassnameArg;
    }

    int maxSimilaritiesPerRow = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerRow"));

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path weightsPath = new Path(tempDirPath, "weights");
    Path pairwiseSimilarityPath = new Path(tempDirPath, "pairwiseSimilarity");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job weights = prepareJob(inputPath, weightsPath, SequenceFileInputFormat.class, RowWeightMapper.class,
                VarIntWritable.class, WeightedOccurrence.class, WeightedOccurrencesPerColumnReducer.class,
                VarIntWritable.class, WeightedOccurrenceArray.class, SequenceFileOutputFormat.class);
        weights.getConfiguration().set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        weights.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job pairwiseSimilarity = prepareJob(weightsPath, pairwiseSimilarityPath, SequenceFileInputFormat.class,
                CooccurrencesMapper.class, WeightedRowPair.class, Cooccurrence.class, SimilarityReducer.class,
                SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                SequenceFileOutputFormat.class);
        Configuration pairwiseConf = pairwiseSimilarity.getConfiguration();
        pairwiseConf.set(DISTRIBUTED_SIMILARITY_CLASSNAME, distributedSimilarityClassname);
        pairwiseConf.setInt(NUMBER_OF_COLUMNS, numberOfColumns);
        pairwiseSimilarity.waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job asMatrix = prepareJob(pairwiseSimilarityPath, outputPath, SequenceFileInputFormat.class,
                Mapper.class, SimilarityMatrixEntryKey.class, DistributedRowMatrix.MatrixEntryWritable.class,
                EntriesToVectorsReducer.class, IntWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        asMatrix.setPartitionerClass(HashPartitioner.class);
        asMatrix.setGroupingComparatorClass(
                SimilarityMatrixEntryKey.SimilarityMatrixEntryKeyGroupingComparator.class);
        asMatrix.getConfiguration().setInt(MAX_SIMILARITIES_PER_ROW, maxSimilaritiesPerRow);
        asMatrix.waitForCompletion(true);
    }

    return 0;
}
From source file:nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java
License:Apache License
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
    addInputOption();
    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number " + "(default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                        "-Dmapred.output.dir=" + similarityMatrixPath,
                        "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassName,
                        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
                        "--tempDir", tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.waitForCompletion(true);

        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        // TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }
    return 0;
}
From source file:nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License:Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this " + "number (default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class,
                VarLongWritable.class, RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }
    return 0;
}