List of usage examples for org.apache.hadoop.mapreduce.Job#setSortComparatorClass
public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException
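Before the per-project examples, a minimal self-contained sketch of the call in isolation. This is not taken from any source file below; the class and job names are hypothetical. The sort comparator is the RawComparator the shuffle uses to order map-output keys before they reach the reducer, and it must be set before the job is submitted (hence the IllegalStateException in the signature). A common use, mirrored by the Grep example below, is to invert an existing WritableComparator so keys sort in descending order:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;

public class DescendingSortSketch {

    // Hypothetical comparator: delegates to IntWritable's default comparator
    // and negates the result, so keys are sorted from largest to smallest.
    public static class DescendingIntComparator extends WritableComparator {
        public DescendingIntComparator() {
            super(IntWritable.class, true);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return -super.compare(a, b);
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "descending-sort-sketch");
        job.setJarByClass(DescendingSortSketch.class);
        // Register the comparator; throws IllegalStateException if the job
        // has already been submitted.
        job.setSortComparatorClass(DescendingIntComparator.class);
        // ... mapper, reducer, and input/output paths would be configured
        // here, as in the examples below ...
    }
}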
From source file:org.apache.hadoop.examples.Grep.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }
    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);
    if (args.length == 4)
        conf.set(RegexMapper.GROUP, args[3]);

    Job grepJob = Job.getInstance(conf);
    try {
        grepJob.setJobName("grep-search");
        grepJob.setJarByClass(Grep.class);

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);
        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        grepJob.waitForCompletion(true);

        Job sortJob = Job.getInstance(conf);
        sortJob.setJobName("grep-sort");
        sortJob.setJarByClass(Grep.class);

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);
        sortJob.setNumReduceTasks(1); // write a single file

        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setSortComparatorClass( // sort by decreasing freq
                LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);
    } finally {
        FileSystem.get(conf).delete(tempDir, true);
    }
    return 0;
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopSortingTest.java
License:Apache License
/**
 * @throws Exception If failed.
 */
public void testSortSimple() throws Exception {
    // Generate test data.
    Job job = Job.getInstance();

    job.setInputFormatClass(InFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    setupFileSystems(job.getConfiguration());

    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_INPUT));

    X.printerrln("Data generation started.");

    grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Data generation complete.");

    // Run main map-reduce job.
    job = Job.getInstance();

    setupFileSystems(job.getConfiguration());

    job.getConfiguration().set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY,
            JavaSerialization.class.getName() + "," + WritableSerialization.class.getName());

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setSortComparatorClass(JavaSerializationComparator.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(2);

    job.setMapOutputKeyClass(UUID.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    X.printerrln("Job started.");

    grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 2), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Job complete.");

    // Check result.
    Path outDir = new Path(igfsScheme() + PATH_OUTPUT);

    AbstractFileSystem fs = AbstractFileSystem.get(new URI(igfsScheme()), job.getConfiguration());

    for (FileStatus file : fs.listStatus(outDir)) {
        X.printerrln("__ file: " + file);

        if (file.getLen() == 0)
            continue;

        FSDataInputStream in = fs.open(file.getPath());

        Scanner sc = new Scanner(in);

        UUID prev = null;

        while (sc.hasNextLine()) {
            UUID next = UUID.fromString(sc.nextLine());

            // X.printerrln("___ check: " + next);

            if (prev != null)
                assertTrue(prev.compareTo(next) < 0);

            prev = next;
        }
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopSortingTest.java
License:Apache License
/**
 * @throws Exception If failed.
 */
public void testSortSimple() throws Exception {
    // Generate test data.
    Job job = Job.getInstance();

    job.setInputFormatClass(InFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);
    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    setupFileSystems(job.getConfiguration());

    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_INPUT));

    X.printerrln("Data generation started.");

    grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Data generation complete.");

    // Run main map-reduce job.
    job = Job.getInstance();

    setupFileSystems(job.getConfiguration());

    job.getConfiguration().set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY,
            JavaSerialization.class.getName() + "," + WritableSerialization.class.getName());

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setSortComparatorClass(JavaSerializationComparator.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(2);

    job.setMapOutputKeyClass(UUID.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    X.printerrln("Job started.");

    grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 2), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Job complete.");

    // Check result.
    Path outDir = new Path(igfsScheme() + PATH_OUTPUT);

    AbstractFileSystem fs = AbstractFileSystem.get(new URI(igfsScheme()), job.getConfiguration());

    for (FileStatus file : fs.listStatus(outDir)) {
        X.printerrln("__ file: " + file);

        if (file.getLen() == 0)
            continue;

        FSDataInputStream in = fs.open(file.getPath());

        Scanner sc = new Scanner(in);

        UUID prev = null;

        while (sc.hasNextLine()) {
            UUID next = UUID.fromString(sc.nextLine());

            // X.printerrln("___ check: " + next);

            if (prev != null)
                assertTrue(prev.compareTo(next) < 0);

            prev = next;
        }
    }
}
From source file:org.apache.mahout.utils.SplitInputJob.java
License:Apache License
/**
 * Run job to downsample, randomly permute and split data into test and
 * training sets. This job takes a SequenceFile as input and outputs two
 * SequenceFiles, test-r-00000 and training-r-00000, which contain the test
 * and training sets respectively.
 *
 * @param initialConf
 *          base Hadoop configuration for the job
 * @param inputPath
 *          path to input data SequenceFile
 * @param outputPath
 *          path for output data SequenceFiles
 * @param keepPct
 *          percentage of key value pairs in input to keep. The rest are
 *          discarded
 * @param randomSelectionPercent
 *          percentage of key value pairs to allocate to test set. Remainder
 *          are allocated to training set
 */
@SuppressWarnings("rawtypes")
public static void run(Configuration initialConf, Path inputPath, Path outputPath, int keepPct,
        float randomSelectionPercent) throws IOException, ClassNotFoundException, InterruptedException {
    int downsamplingFactor = (int) (100.0 / keepPct);
    initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
    initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);

    // Determine class of keys and values
    FileSystem fs = FileSystem.get(initialConf);
    SequenceFileDirIterator<? extends WritableComparable, Writable> iterator =
            new SequenceFileDirIterator<WritableComparable, Writable>(inputPath, PathType.LIST,
                    PathFilters.partFilter(), null, false, fs.getConf());
    Class<? extends WritableComparable> keyClass;
    Class<? extends Writable> valueClass;
    if (iterator.hasNext()) {
        Pair<? extends WritableComparable, Writable> pair = iterator.next();
        keyClass = pair.getFirst().getClass();
        valueClass = pair.getSecond().getClass();
    } else {
        throw new IllegalStateException("Couldn't determine class of the input values");
    }

    Job job = new Job(new Configuration(initialConf));
    MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    job.setJarByClass(SplitInputJob.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setNumReduceTasks(1);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(SplitInputMapper.class);
    job.setReducerClass(SplitInputReducer.class);
    job.setSortComparatorClass(SplitInputComparator.class);
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
    job.submit();
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}
From source file:org.apache.mrql.GroupByJoinPlan.java
License:Apache License
/** the GroupByJoin operation: an equi-join combined with a group-by, implemented using hashing
 * @param left_join_key_fnc left join key function from a to k
 * @param right_join_key_fnc right join key function from b to k
 * @param left_groupby_fnc left group-by function from a to k1
 * @param right_groupby_fnc right group-by function from b to k2
 * @param accumulator_fnc accumulator function from (c,(a,b)) to c
 * @param zero the left zero of accumulator of type c
 * @param reduce_fnc reduce function from ((k1,k2),c) to d
 * @param X left data set of type {a}
 * @param Y right data set of type {b}
 * @param num_reducers number of reducers
 * @param n left dimension of the reducer grid
 * @param m right dimension of the reducer grid
 * @param stop_counter optional counter used in repeat operation
 * @return a DataSet that contains the result of type {d}
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc, // left join key function
        Tree right_join_key_fnc, // right join key function
        Tree left_groupby_fnc,   // left group-by function
        Tree right_groupby_fnc,  // right group-by function
        Tree accumulator_fnc,    // accumulator function
        Tree zero,               // the left zero of accumulator
        Tree reduce_fnc,         // reduce function
        DataSet X,               // left data set
        DataSet Y,               // right data set
        int num_reducers,        // number of reducers
        int n, int m,            // dimensions of the reducer grid
        String stop_counter)     // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(GroupByJoinKey.class);
    job.setJarByClass(GroupByJoinPlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(GroupByJoinPartitioner.class);
    job.setSortComparatorClass(GroupByJoinSortComparator.class);
    job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    job.setReducerClass(JoinReducer.class);
    if (num_reducers > 0)
        job.setNumReduceTasks(num_reducers);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = false;
    return new DataSet(s, c, MapReducePlan.outputRecords(job));
}
From source file:org.apache.mrql.JoinOperation.java
License:Apache License
/** The MapReduce2 physical operator (a reduce-side join)
 * @param mx left mapper function
 * @param my right mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param X left data set
 * @param Y right data set
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce2(Tree mx, // left mapper function
        Tree my,             // right mapper function
        Tree combine_fnc,    // optional in-mapper combiner function
        Tree reduce_fnc,     // reducer function
        Tree acc_fnc,        // optional accumulator function
        Tree zero,           // optional the zero value for the accumulator
        DataSet X,           // left data set
        DataSet Y,           // right data set
        int num_reduces,     // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp)      // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(JoinKey.class);
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerJoinPartitioner.class);
    job.setSortComparatorClass(MRContainerSortComparator.class);
    job.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce2 reducer");
    job.setReducerClass(JoinReducer.class);
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.mrql.MapReduceOperation.java
License:Apache License
/**
 * The MapReduce physical operator
 * @param map_fnc the mapper function
 * @param combine_fnc optional in-mapper combiner function
 * @param reduce_fnc the reducer function
 * @param acc_fnc optional accumulator function
 * @param zero optional the zero value for the accumulator
 * @param source the input data source
 * @param num_reduces number of reducers
 * @param stop_counter optional counter used in repeat operation
 * @param orderp does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc, // mapper function
        Tree combine_fnc,    // optional in-mapper combiner function
        Tree reduce_fnc,     // reducer function
        Tree acc_fnc,        // optional accumulator function
        Tree zero,           // optional the zero value for the accumulator
        DataSet source,      // input data source
        int num_reduces,     // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp)      // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0
            : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}
From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java
License:Apache License
private int runQueryCopy() throws Exception {
    log.info("Setting up Copy Tool with a query-based ruleset...");
    setup();
    if (!useCopyFileOutput) {
        createChildInstance(conf);
    }

    // Set up the configuration
    final AccumuloRdfConfiguration aconf = new AccumuloRdfConfiguration(conf);
    aconf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    aconf.setTablePrefix(tablePrefix);
    aconf.setFlush(false);
    ConfigUtils.setIndexers(aconf);

    // Since we're copying at the statement-level, ignore any given list of tables and determine
    // which tables we might need to create based on which indexers are desired.
    final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy(tablePrefix);
    tables.clear();
    // Always include core tables
    tables.add(prefixStrategy.getSpo());
    tables.add(prefixStrategy.getOsp());
    tables.add(prefixStrategy.getPo());
    // Copy namespaces if they exist
    tables.add(prefixStrategy.getNs());
    // Add tables associated with any configured indexers
    /* TODO: SEE RYA-160
    if (aconf.getBoolean(ConfigUtils.USE_FREETEXT, false)) {
        tables.add(ConfigUtils.getFreeTextDocTablename(conf));
        tables.add(ConfigUtils.getFreeTextTermTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_GEO, false)) {
        tables.add(ConfigUtils.getGeoTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_TEMPORAL, false)) {
        tables.add(ConfigUtils.getTemporalTableName(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_ENTITY, false)) {
        tables.add(ConfigUtils.getEntityTableName(conf));
    }
    */
    // Ignore anything else, e.g. statistics -- must be recalculated for the child if desired

    // Extract the ruleset, and copy the namespace table directly
    final AccumuloQueryRuleset ruleset = new AccumuloQueryRuleset(aconf);
    ruleset.addTable(prefixStrategy.getNs());
    for (final String line : ruleset.toString().split("\n")) {
        log.info(line);
    }

    // Create a Job and configure its input and output
    final Job job = Job.getInstance(aconf);
    job.setJarByClass(this.getClass());
    setupMultiTableInputFormat(job, ruleset);
    setupAccumuloOutput(job, "");

    if (useCopyFileOutput) {
        // Configure job for file output
        job.setJobName("Ruleset-based export to file: " + tablePrefix + " -> " + localBaseOutputDir);
        // Map (row) to (table+key, key+value)
        job.setMapperClass(RowRuleMapper.class);
        job.setMapOutputKeyClass(GroupedRow.class);
        job.setMapOutputValueClass(GroupedRow.class);
        // Group according to table and sort according to key
        job.setGroupingComparatorClass(GroupedRow.GroupComparator.class);
        job.setSortComparatorClass(GroupedRow.SortComparator.class);
        // Reduce ([table+row], rows): output each row to the file for that table, in sorted order
        job.setReducerClass(MultipleFileReducer.class);
        job.setOutputKeyClass(Key.class);
        job.setOutputValueClass(Value.class);
    } else {
        // Configure job for table output
        job.setJobName("Ruleset-based copy: " + tablePrefix + " -> " + childTablePrefix);
        // Map (row): convert to statement, insert to child (for namespace table, output row directly)
        job.setMapperClass(AccumuloRyaRuleMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);
        // Create the child tables, so mappers don't try to do this in parallel
        for (final String parentTable : tables) {
            final String childTable = parentTable.replaceFirst(tablePrefix, childTablePrefix);
            createTableIfNeeded(childTable);
        }
    }

    // Run the job and copy files to local filesystem if needed
    final Date beginTime = new Date();
    log.info("Job started: " + beginTime);
    final boolean success = job.waitForCompletion(true);
    if (success) {
        if (useCopyFileOutput) {
            log.info("Moving data from HDFS to the local file system");
            final Path baseOutputPath = new Path(baseOutputDir);
            for (final FileStatus status : FileSystem.get(conf).listStatus(baseOutputPath)) {
                if (status.isDirectory()) {
                    final String tableName = status.getPath().getName();
                    final Path hdfsPath = getPath(baseOutputDir, tableName);
                    final Path localPath = getPath(localBaseOutputDir, tableName);
                    log.info("HDFS directory: " + hdfsPath.toString());
                    log.info("Local directory: " + localPath.toString());
                    copyHdfsToLocal(hdfsPath, localPath);
                }
            }
        }
        final Date endTime = new Date();
        log.info("Job finished: " + endTime);
        log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
        return 0;
    } else {
        log.error("Job failed!!!");
        return 1;
    }
}
From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java
License:Apache License
public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER,
                InputOutputHelper.getInputFormat((Class) newConfiguration
                        .getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                        .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);

    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }

    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);

    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}
From source file:org.clueweb.clueweb12.app.BuildDictionary.java
License:Apache License
/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of terms").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(COUNT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Configuration conf = getConf();
    conf.set(HADOOP_OUTPUT_OPTION, output);
    conf.setInt(HADOOP_TERMS_COUNT_OPTION, Integer.parseInt(cmdline.getOptionValue(COUNT_OPTION)));
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + input);
    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}