List of usage examples for org.apache.hadoop.mapreduce Job setCombinerClass
public void setCombinerClass(Class<? extends Reducer> cls) throws IllegalStateException
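Before the per-project examples below, here is a minimal self-contained sketch of where setCombinerClass fits in a job definition. The class CombinerSketch and its nested mapper/reducer are illustrative only (the standard word-count pattern), not taken from any of the listed projects; the key point is that the combiner must consume and produce the map output key/value types, which is why a summing Reducer is reused for both roles.

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombinerSketch {

    // Emits (word, 1) for every token in the input line.
    public static class TokenizerMapper extends Mapper<Object, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        @Override
        protected void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            StringTokenizer itr = new StringTokenizer(value.toString());
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, ONE);
            }
        }
    }

    // Sums counts per word; usable both as combiner and as reducer because its
    // input and output key/value types are identical.
    public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private final IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combiner sketch");
        job.setJarByClass(CombinerSketch.class);
        job.setMapperClass(TokenizerMapper.class);
        // The combiner runs on map output before the shuffle to cut network traffic.
        // setCombinerClass must be called before the job is submitted, otherwise it
        // throws IllegalStateException.
        job.setCombinerClass(IntSumReducer.class);
        job.setReducerClass(IntSumReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

A combiner is only safe when the reduce operation is associative and commutative (sums, counts, maxima); most of the examples below follow exactly this pattern of reusing a sum-style reducer as the combiner.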
From source file: nl.basjes.hadoop.examples.WordCount.java
License: Apache License
public int run(String[] args) throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName("Wordcount");
    job.setJarByClass(getClass());

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    if (args.length > 2 && "-s".equals(args[2])) {
        if (args.length < 4) {
            System.out.println("Invalid parameters. Usage: -s <split.maxsize>");
            return -1;
        }
        job.getConfiguration().set("io.compression.codecs",
                "nl.basjes.hadoop.io.compress.SplittableGzipCodec");
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize",
                Long.parseLong(args[3]) - 10000);
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize",
                Long.parseLong(args[3]));
        job.setJobName("Wordcount-" + args[3]);
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(WordSplittingMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);
    job.setOutputKeyClass(Text.class);
    // LongSumReducer emits LongWritable values, so the output value class must match.
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    return (job.waitForCompletion(true) ? 1 : 0);
}
From source file: nl.basjes.hadoop.io.input.Wordcount.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        return 2;
    }

    conf.set("nl.basjes.parse.apachehttpdlogline.format", logFormat);

    // A ',' separated list of fields
    conf.set("nl.basjes.parse.apachehttpdlogline.fields", "STRING:request.status.last");

    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(Wordcount.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));

    job.setInputFormatClass(ApacheHttpdLogfileInputFormat.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    // configuration should contain reference to your namenode
    FileSystem fs = FileSystem.get(conf);
    // true stands for recursively deleting the folder you gave
    Path outputPath = new Path(otherArgs[1]);
    fs.delete(outputPath, true);

    FileOutputFormat.setOutputPath(job, outputPath);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}
From source file: nl.cwi.hadoop.kba.stat.CountGenres.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    String in = null;
    String out = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0) {
        return printUsage();
    }
    if (in == null || out == null) {
        return printUsage();
    }

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    Job job = new Job(conf, "Count genres");
    job.setJarByClass(CountGenres.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
From source file: nl.cwi.kba2013.apps.AnnotationExtractor.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    String in = null;
    String out = null;
    String annoFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-q".equals(args[i])) {
            } else if ("-a".equals(args[i])) {
                annoFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(ANNOFILEPATH_HDFS, new Path(annoFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");

    FileSystem fs = FileSystem.get(conf);

    // Lookup required data from the topic file
    Job job = new Job(conf, "Annotation Extractor");
    job.setJarByClass(AnnotationExtractor.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    // job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(annoFile) + "#" + ANNOFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(TextInputFormat.class);
    // job.setMapperClass(MyMapper.class);
    FileInputFormat.addInputPath(job, new Path(in));

    // job.setMapOutputKeyClass(Text.class);
    // job.setMapOutputValueClass(Text.class);

    job.setCombinerClass(MyReducer.class);
    // job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(1);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // Let's go
    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file: nl.cwi.kba2013.apps.KBANameVariantMatchTHERank.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    String in = null;
    String out = null;
    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANameVariantMatchTHERank.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    // job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setNumReduceTasks(50);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    // LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file: nl.cwi.kba2013.apps.KBANER.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    String in = null;
    String out = null;
    String labelsFile = null;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-i".equals(args[i])) {
                in = args[++i];
            } else if ("-o".equals(args[i])) {
                out = args[++i];
            } else if ("-l".equals(args[i])) {
                labelsFile = args[++i];
            } else if ("-h".equals(args[i]) || "--help".equals(args[i])) {
                return printUsage();
            } else {
                other_args.add(args[i]);
            }
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }

    if (other_args.size() > 0 || in == null || out == null || labelsFile == null) {
        return printUsage();
    }

    LOG.info("Tool: " + this.getClass().getName());
    LOG.info(" - input path: " + in);
    LOG.info(" - output path: " + out);

    Configuration conf = getConf();
    conf.set(LABELSFILEPATH_HDFS, new Path(labelsFile).toUri().toString());

    // set time
    conf.setLong("mapred.task.timeout", 40 * 600000);
    conf.set("mapred.map.child.java.opts", "-Xmx4g -XX:-UseGCOverheadLimit");
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    Job job = new Job(conf, "Feature Extractor");
    job.setJarByClass(KBANER.class);

    // some weird issues with Thrift classes in the Hadoop distro.
    job.setUserClassesTakesPrecedence(true);

    // make the query file available to each mapper.
    DistributedCache.addCacheFile(new URI(new Path(labelsFile) + "#" + LABELSFILEPATH_HDFS),
            job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());

    job.setInputFormatClass(ThriftFileInputFormat.class);
    // job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyReducer.class);
    job.setReducerClass(MyReducer.class);
    FileInputFormat.addInputPath(job, new Path(in));

    job.setNumReduceTasks(0);

    FileSystem.get(conf).delete(new Path(out), true);
    TextOutputFormat.setOutputPath(job, new Path(out));
    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    // job.setOutputValueClass(StreamItemWritable.class);
    job.setOutputValueClass(Text.class);
    // LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    int status = job.waitForCompletion(true) ? 0 : 1;
    return status;
}
From source file: nl.gridline.zieook.runners.cf.ItemSimilarityJobZieook.java
License: Apache License
@Override
public int run(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    addInputOption();
    // addOutputOption(); // no output path, we use a table!
    addOption("outputtable", "ot", "Output table name");
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')');
    addOption("maxSimilaritiesPerItem", "m",
            "try to cap the number of similar items per item to this number " + "(default: "
                    + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this number " + "(default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this " + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    String similarityClassName = parsedArgs.get("--similarityClassname");
    int maxSimilarItemsPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));

    Path inputPath = getInputPath();
    // Path outputPath = getOutputPath();
    String outputTable = parsedArgs.get("--outputtable");
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));

    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    /*
     * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
     * new DistributedRowMatrix(...).rowSimilarity(...)
     */
    try {
        ToolRunner.run(getConf(), new RowSimilarityZieOok(),
                new String[] { "-Dmapred.input.dir=" + itemUserMatrixPath,
                        "-Dmapred.output.dir=" + similarityMatrixPath,
                        "--numberOfColumns", String.valueOf(numberOfUsers),
                        "--similarityClassname", similarityClassName,
                        "--maxSimilaritiesPerRow", String.valueOf(maxSimilarItemsPerItem + 1),
                        "--tempDir", tempDirPath.toString() });
    } catch (Exception e) {
        throw new IllegalStateException("item-item-similarity computation failed", e);
    }

    // This step writes the data to a file, we don't want that, it should be written in HBase directly:
    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job mostSimilarItems = prepareMostSimilarItems(similarityMatrixPath, outputTable);

        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.waitForCompletion(true);

        task.setCurrentJob(mostSimilarItems).waitForCompletion(Log.isDebugEnabled());

        // Job mostSimilarItems = prepareJob(similarityMatrixPath, outputPath, SequenceFileInputFormat.class,
        // MostSimilarItemPairsMapper.class, EntityEntityWritable.class, DoubleWritable.class,
        // MostSimilarItemPairsReducer.class, EntityEntityWritable.class, DoubleWritable.class,
        // TextOutputFormat.class);
        // Configuration mostSimilarItemsConf = mostSimilarItems.getConfiguration();
        // mostSimilarItemsConf.set(ITEM_ID_INDEX_PATH_STR, itemIDIndexPath.toString());
        // mostSimilarItemsConf.setInt(MAX_SIMILARITIES_PER_ITEM, maxSimilarItemsPerItem);
        // mostSimilarItems.setCombinerClass(MostSimilarItemPairsReducer.class);
        // mostSimilarItems.waitForCompletion(true);
    }
    return 0;
}
From source file: nl.gridline.zieook.runners.cf.RecommenderJobZieOok.java
License: Apache License
@Override
public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

    addInputOption();
    addOutputOption();
    addOption("numRecommendations", "n", "Number of recommendations per user",
            String.valueOf(AggregateAndRecommendReducer.DEFAULT_NUM_RECOMMENDATIONS));
    addOption("usersFile", "u", "File of users to recommend for", null);
    addOption("itemsFile", "i", "File of items to recommend for", null);
    addOption("filterFile", "f",
            "File containing comma-separated userID,itemID pairs. Used to exclude the item from "
                    + "the recommendations for that user (optional)",
            null);
    addOption("booleanData", "b", "Treat input as without pref values", Boolean.FALSE.toString());
    addOption("maxPrefsPerUser", "mp",
            "Maximum number of preferences considered per user in final recommendation phase",
            String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
    addOption("minPrefsPerUser", "mp",
            "ignore users with less preferences than this in the similarity computation " + "(default: "
                    + DEFAULT_MIN_PREFS_PER_USER + ')',
            String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
    addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities considered per item ",
            String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
    addOption("maxCooccurrencesPerItem", "mo",
            "try to cap the number of cooccurrences per item to this " + "number (default: "
                    + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
            String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
    addOption("similarityClassname", "s",
            "Name of distributed similarity class to instantiate, alternatively use "
                    + "one of the predefined similarities (" + SimilarityType.listEnumNames() + ')',
            String.valueOf(SimilarityType.SIMILARITY_COOCCURRENCE));

    Map<String, String> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }

    Path inputPath = getInputPath();
    Path outputPath = getOutputPath();
    Path tempDirPath = new Path(parsedArgs.get("--tempDir"));
    int numRecommendations = Integer.parseInt(parsedArgs.get("--numRecommendations"));
    String usersFile = parsedArgs.get("--usersFile");
    String itemsFile = parsedArgs.get("--itemsFile");
    String filterFile = parsedArgs.get("--filterFile");
    boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
    int maxPrefsPerUser = Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
    int minPrefsPerUser = Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
    int maxSimilaritiesPerItem = Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
    int maxCooccurrencesPerItem = Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
    String similarityClassname = parsedArgs.get("--similarityClassname");

    Path userVectorPath = new Path(tempDirPath, "userVectors");
    Path itemIDIndexPath = new Path(tempDirPath, "itemIDIndex");
    Path countUsersPath = new Path(tempDirPath, "countUsers");
    Path itemUserMatrixPath = new Path(tempDirPath, "itemUserMatrix");
    Path similarityMatrixPath = new Path(tempDirPath, "similarityMatrix");
    Path prePartialMultiplyPath1 = new Path(tempDirPath, "prePartialMultiply1");
    Path prePartialMultiplyPath2 = new Path(tempDirPath, "prePartialMultiply2");
    Path explicitFilterPath = new Path(tempDirPath, "explicitFilterPath");
    Path partialMultiplyPath = new Path(tempDirPath, "partialMultiply");

    AtomicInteger currentPhase = new AtomicInteger();

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job itemIDIndex = prepareJob(inputPath, itemIDIndexPath, TextInputFormat.class, ItemIDIndexMapper.class,
                VarIntWritable.class, VarLongWritable.class, ItemIDIndexReducer.class, VarIntWritable.class,
                VarLongWritable.class, SequenceFileOutputFormat.class);
        itemIDIndex.setCombinerClass(ItemIDIndexReducer.class);
        task.setCurrentJob(itemIDIndex).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job toUserVector = prepareJob(inputPath, userVectorPath, TextInputFormat.class, ToItemPrefsMapper.class,
                VarLongWritable.class, booleanData ? VarLongWritable.class : EntityPrefWritable.class,
                ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
                SequenceFileOutputFormat.class);
        toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
        toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER, minPrefsPerUser);
        task.setCurrentJob(toUserVector).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job countUsers = prepareJob(userVectorPath, countUsersPath, SequenceFileInputFormat.class,
                CountUsersMapper.class, CountUsersKeyWritable.class, VarLongWritable.class,
                CountUsersReducer.class, VarIntWritable.class, NullWritable.class, TextOutputFormat.class);
        countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
        countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
        task.setCurrentJob(countUsers).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job maybePruneAndTransponse = prepareJob(userVectorPath, itemUserMatrixPath,
                SequenceFileInputFormat.class, MaybePruneRowsMapper.class, IntWritable.class,
                DistributedRowMatrix.MatrixEntryWritable.class, ToItemVectorsReducer.class, IntWritable.class,
                VectorWritable.class, SequenceFileOutputFormat.class);
        maybePruneAndTransponse.getConfiguration().setInt(MaybePruneRowsMapper.MAX_COOCCURRENCES,
                maxCooccurrencesPerItem);
        task.setCurrentJob(maybePruneAndTransponse).waitForCompletion(true);
    }

    int numberOfUsers = TasteHadoopUtils.readIntFromFile(getConf(), countUsersPath);

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /*
         * Once DistributedRowMatrix uses the hadoop 0.20 API, we should refactor this call to something like
         * new DistributedRowMatrix(...).rowSimilarity(...)
         */
        try {
            ToolRunner.run(getConf(), new RowSimilarityZieOok(), new String[] { //
                    "--input", itemUserMatrixPath.toString(), //
                    "--output", similarityMatrixPath.toString(), //
                    "--numberOfColumns", String.valueOf(numberOfUsers), //
                    "--similarityClassname", similarityClassname, //
                    "--maxSimilaritiesPerRow", String.valueOf(maxSimilaritiesPerItem + 1), //
                    "--tempDir", tempDirPath.toString() });
        } catch (Exception e) {
            throw new IllegalStateException("item-item-similarity computation failed", e);
        }
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        Job prePartialMultiply1 = prepareJob(similarityMatrixPath, prePartialMultiplyPath1,
                SequenceFileInputFormat.class, SimilarityMatrixRowWrapperMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        task.setCurrentJob(prePartialMultiply1).waitForCompletion(true);

        Job prePartialMultiply2 = prepareJob(userVectorPath, prePartialMultiplyPath2,
                SequenceFileInputFormat.class, UserVectorSplitterMapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, Reducer.class, VarIntWritable.class, VectorOrPrefWritable.class,
                SequenceFileOutputFormat.class);
        if (usersFile != null) {
            prePartialMultiply2.getConfiguration().set(UserVectorSplitterMapper.USERS_FILE, usersFile);
        }
        prePartialMultiply2.getConfiguration().setInt(UserVectorSplitterMapper.MAX_PREFS_PER_USER_CONSIDERED,
                maxPrefsPerUser);
        task.setCurrentJob(prePartialMultiply2).waitForCompletion(true);

        Job partialMultiply = prepareJob(new Path(prePartialMultiplyPath1 + "," + prePartialMultiplyPath2),
                partialMultiplyPath, SequenceFileInputFormat.class, Mapper.class, VarIntWritable.class,
                VectorOrPrefWritable.class, ToVectorAndPrefReducer.class, VarIntWritable.class,
                VectorAndPrefsWritable.class, SequenceFileOutputFormat.class);

        /* necessary to make this job (having a combined input path) work on Amazon S3 */
        Configuration partialMultiplyConf = partialMultiply.getConfiguration();
        FileSystem fs = FileSystem.get(tempDirPath.toUri(), partialMultiplyConf);
        prePartialMultiplyPath1 = prePartialMultiplyPath1.makeQualified(fs);
        prePartialMultiplyPath2 = prePartialMultiplyPath2.makeQualified(fs);
        FileInputFormat.setInputPaths(partialMultiply, prePartialMultiplyPath1, prePartialMultiplyPath2);
        task.setCurrentJob(partialMultiply).waitForCompletion(true);
    }

    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
        /* convert the user/item pairs to filter if a filterfile has been specified */
        if (filterFile != null) {
            Job itemFiltering = prepareJob(new Path(filterFile), explicitFilterPath, TextInputFormat.class,
                    ItemFilterMapper.class, VarLongWritable.class, VarLongWritable.class,
                    ItemFilterAsVectorAndPrefsReducer.class, VarIntWritable.class, VectorAndPrefsWritable.class,
                    SequenceFileOutputFormat.class);
            task.setCurrentJob(itemFiltering).waitForCompletion(true);
        }

        String aggregateAndRecommendInput = partialMultiplyPath.toString();
        if (filterFile != null) {
            aggregateAndRecommendInput += "," + explicitFilterPath;
        }

        Job aggregateAndRecommend = prepareJob(new Path(aggregateAndRecommendInput), outputPath,
                SequenceFileInputFormat.class, PartialMultiplyMapper.class, VarLongWritable.class,
                PrefAndSimilarityColumnWritable.class, AggregateAndRecommendReducer.class, VarLongWritable.class,
                RecommendedItemsWritable.class, SequenceFileOutputFormat.class);
        Configuration aggregateAndRecommendConf = aggregateAndRecommend.getConfiguration();
        if (itemsFile != null) {
            aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMS_FILE, itemsFile);
        }

        if (filterFile != null) {
            /* necessary to make this job (having a combined input path) work on Amazon S3 */
            FileSystem fs = FileSystem.get(tempDirPath.toUri(), aggregateAndRecommendConf);
            partialMultiplyPath = partialMultiplyPath.makeQualified(fs);
            explicitFilterPath = explicitFilterPath.makeQualified(fs);
            FileInputFormat.setInputPaths(aggregateAndRecommend, partialMultiplyPath, explicitFilterPath);
        }
        setIOSort(aggregateAndRecommend);
        aggregateAndRecommendConf.set(AggregateAndRecommendReducer.ITEMID_INDEX_PATH,
                itemIDIndexPath.toString());
        aggregateAndRecommendConf.setInt(AggregateAndRecommendReducer.NUM_RECOMMENDATIONS, numRecommendations);
        aggregateAndRecommendConf.setBoolean(BOOLEAN_DATA, booleanData);
        task.setCurrentJob(aggregateAndRecommend).waitForCompletion(true);
    }
    return 0;
}
From source file: nl.utwente.bigdata.GoalPlayerCount.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: GoalPlayerCount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "GoldenBoot GoalPlayerCount");
    job.setJarByClass(GoalPlayerCount.class);
    job.setMapperClass(CountMapper.class);
    job.setCombinerClass(CountReducer.class);
    job.setReducerClass(CountReducer.class);
    job.setInputFormatClass(TextInputFormat.class);
    // output keys are words (Text), not an InputFormat class
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file: nl.utwente.bigdata.GoalScorerDefiner.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: userCount <in> [<in>...] <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "GoalScorerDefiner");
    job.setJarByClass(GoalScorerDefiner.class);
    job.setMapperClass(GoalScorerDefiner.ScoreMapper.class);
    job.setCombinerClass(GoalScorerDefiner.ScoreReducer.class);
    job.setReducerClass(GoalScorerDefiner.ScoreReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}