Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

List of usage examples for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer.
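
Before the individual examples, here is a minimal sketch of how the method is typically wired into a job driver; the class name SortComparatorDemo is hypothetical, and only the built-in LongWritable.DecreasingComparator is assumed. The comparator supplied to setSortComparatorClass is applied to the map output keys during the shuffle sort, so the Reducer sees keys in the comparator's order rather than the key type's natural order.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;

public class SortComparatorDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "sort-comparator-demo");
        job.setJarByClass(SortComparatorDemo.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        // Sort intermediate keys in descending numeric order instead of the default ascending order.
        job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
        // Mapper, reducer, and input/output paths would be configured here before job.waitForCompletion(true).
    }
}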

Usage

From source file:org.apache.hadoop.examples.Grep.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 3) {
        System.out.println("Grep <inDir> <outDir> <regex> [<group>]");
        ToolRunner.printGenericCommandUsage(System.out);
        return 2;
    }

    Path tempDir = new Path("grep-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    Configuration conf = getConf();
    conf.set(RegexMapper.PATTERN, args[2]);
    if (args.length == 4)
        conf.set(RegexMapper.GROUP, args[3]);

    Job grepJob = Job.getInstance(conf);

    try {

        grepJob.setJobName("grep-search");
        grepJob.setJarByClass(Grep.class);

        FileInputFormat.setInputPaths(grepJob, args[0]);

        grepJob.setMapperClass(RegexMapper.class);

        grepJob.setCombinerClass(LongSumReducer.class);
        grepJob.setReducerClass(LongSumReducer.class);

        FileOutputFormat.setOutputPath(grepJob, tempDir);
        grepJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        grepJob.setOutputKeyClass(Text.class);
        grepJob.setOutputValueClass(LongWritable.class);

        grepJob.waitForCompletion(true);

        Job sortJob = Job.getInstance(conf);
        sortJob.setJobName("grep-sort");
        sortJob.setJarByClass(Grep.class);

        FileInputFormat.setInputPaths(sortJob, tempDir);
        sortJob.setInputFormatClass(SequenceFileInputFormat.class);

        sortJob.setMapperClass(InverseMapper.class);

        sortJob.setNumReduceTasks(1); // write a single file
        FileOutputFormat.setOutputPath(sortJob, new Path(args[1]));
        sortJob.setSortComparatorClass( // sort by decreasing freq
                LongWritable.DecreasingComparator.class);

        sortJob.waitForCompletion(true);
    } finally {
        FileSystem.get(conf).delete(tempDir, true);
    }
    return 0;
}
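
When no built-in comparator fits, a custom one can be registered in the same way. The sketch below is hypothetical (DescendingLongComparator is not part of the Grep example); it extends WritableComparator and inverts the natural ordering of LongWritable keys, which has the same effect as the LongWritable.DecreasingComparator used above.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingLongComparator extends WritableComparator {
    public DescendingLongComparator() {
        // Pass true so key instances are created and compare(WritableComparable, WritableComparable) is used.
        super(LongWritable.class, true);
    }

    @Override
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public int compare(WritableComparable a, WritableComparable b) {
        return -a.compareTo(b); // invert the natural ascending order
    }
}

It would then be registered with sortJob.setSortComparatorClass(DescendingLongComparator.class).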

From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopSortingTest.java

License:Apache License

/**
 * @throws Exception If failed.
 */
public void testSortSimple() throws Exception {
    // Generate test data.
    Job job = Job.getInstance();

    job.setInputFormatClass(InFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    setupFileSystems(job.getConfiguration());

    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_INPUT));

    X.printerrln("Data generation started.");

    grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Data generation complete.");

    // Run main map-reduce job.
    job = Job.getInstance();

    setupFileSystems(job.getConfiguration());

    job.getConfiguration().set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY,
            JavaSerialization.class.getName() + "," + WritableSerialization.class.getName());

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setSortComparatorClass(JavaSerializationComparator.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setNumReduceTasks(2);

    job.setMapOutputKeyClass(UUID.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    X.printerrln("Job started.");

    grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 2), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Job complete.");

    // Check result.
    Path outDir = new Path(igfsScheme() + PATH_OUTPUT);

    AbstractFileSystem fs = AbstractFileSystem.get(new URI(igfsScheme()), job.getConfiguration());

    for (FileStatus file : fs.listStatus(outDir)) {
        X.printerrln("__ file: " + file);

        if (file.getLen() == 0)
            continue;

        FSDataInputStream in = fs.open(file.getPath());

        Scanner sc = new Scanner(in);

        UUID prev = null;

        while (sc.hasNextLine()) {
            UUID next = UUID.fromString(sc.nextLine());

            //                X.printerrln("___ check: " + next);

            if (prev != null)
                assertTrue(prev.compareTo(next) < 0);

            prev = next;
        }
    }
}

From source file:org.apache.ignite.internal.processors.hadoop.HadoopSortingTest.java

License:Apache License

/**
 * @throws Exception If failed.
 */
public void testSortSimple() throws Exception {
    // Generate test data.
    Job job = Job.getInstance();

    job.setInputFormatClass(InFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Mapper.class);
    job.setNumReduceTasks(0);

    setupFileSystems(job.getConfiguration());

    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_INPUT));

    X.printerrln("Data generation started.");

    grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Data generation complete.");

    // Run main map-reduce job.
    job = Job.getInstance();

    setupFileSystems(job.getConfiguration());

    job.getConfiguration().set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY,
            JavaSerialization.class.getName() + "," + WritableSerialization.class.getName());

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + PATH_INPUT));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setSortComparatorClass(JavaSerializationComparator.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    job.setNumReduceTasks(2);

    job.setMapOutputKeyClass(UUID.class);
    job.setMapOutputValueClass(NullWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    X.printerrln("Job started.");

    grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 2), createJobInfo(job.getConfiguration()))
            .get(180000);

    X.printerrln("Job complete.");

    // Check result.
    Path outDir = new Path(igfsScheme() + PATH_OUTPUT);

    AbstractFileSystem fs = AbstractFileSystem.get(new URI(igfsScheme()), job.getConfiguration());

    for (FileStatus file : fs.listStatus(outDir)) {
        X.printerrln("__ file: " + file);

        if (file.getLen() == 0)
            continue;

        FSDataInputStream in = fs.open(file.getPath());

        Scanner sc = new Scanner(in);

        UUID prev = null;

        while (sc.hasNextLine()) {
            UUID next = UUID.fromString(sc.nextLine());

            //                X.printerrln("___ check: " + next);

            if (prev != null)
                assertTrue(prev.compareTo(next) < 0);

            prev = next;
        }
    }
}

From source file:org.apache.mahout.utils.SplitInputJob.java

License:Apache License

/**
 * Run job to downsample, randomly permute and split data into test and
 * training sets. This job takes a SequenceFile as input and outputs two
 * SequenceFiles test-r-00000 and training-r-00000 which contain the test and
 * training sets respectively
 *
 * @param initialConf
 * @param inputPath
 *          path to input data SequenceFile
 * @param outputPath
 *          path for output data SequenceFiles
 * @param keepPct
 *          percentage of key value pairs in input to keep. The rest are
 *          discarded
 * @param randomSelectionPercent
 *          percentage of key value pairs to allocate to test set. Remainder
 *          are allocated to training set
 */
@SuppressWarnings("rawtypes")
public static void run(Configuration initialConf, Path inputPath, Path outputPath, int keepPct,
        float randomSelectionPercent) throws IOException, ClassNotFoundException, InterruptedException {

    int downsamplingFactor = (int) (100.0 / keepPct);
    initialConf.setInt(DOWNSAMPLING_FACTOR, downsamplingFactor);
    initialConf.setFloat(RANDOM_SELECTION_PCT, randomSelectionPercent);

    // Determine class of keys and values
    FileSystem fs = FileSystem.get(initialConf);

    SequenceFileDirIterator<? extends WritableComparable, Writable> iterator = new SequenceFileDirIterator<WritableComparable, Writable>(
            inputPath, PathType.LIST, PathFilters.partFilter(), null, false, fs.getConf());
    Class<? extends WritableComparable> keyClass;
    Class<? extends Writable> valueClass;
    if (iterator.hasNext()) {
        Pair<? extends WritableComparable, Writable> pair = iterator.next();
        keyClass = pair.getFirst().getClass();
        valueClass = pair.getSecond().getClass();
    } else {
        throw new IllegalStateException("Couldn't determine class of the input values");
    }

    Job job = new Job(new Configuration(initialConf));

    MultipleOutputs.addNamedOutput(job, TRAINING_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    MultipleOutputs.addNamedOutput(job, TEST_TAG, SequenceFileOutputFormat.class, keyClass, valueClass);
    job.setJarByClass(SplitInputJob.class);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    job.setNumReduceTasks(1);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setMapperClass(SplitInputMapper.class);
    job.setReducerClass(SplitInputReducer.class);
    job.setSortComparatorClass(SplitInputComparator.class);
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
    job.submit();
    boolean succeeded = job.waitForCompletion(true);
    if (!succeeded) {
        throw new IllegalStateException("Job failed!");
    }
}

From source file:org.apache.mrql.GroupByJoinPlan.java

License:Apache License

/** the GroupByJoin operation:
 *      an equi-join combined with a group-by implemented using hashing
 * @param left_join_key_fnc   left join key function from a to k
 * @param right_join_key_fnc  right join key function from b to k
 * @param left_groupby_fnc    left group-by function from a to k1
 * @param right_groupby_fnc   right group-by function from b to k2
 * @param accumulator_fnc     accumulator function from (c,(a,b)) to c
 * @param zero                the left zero of accumulator of type c
 * @param reduce_fnc          reduce function from ((k1,k2),c) to d
 * @param X                   left data set of type {a}
 * @param Y                   right data set of type {b}
 * @param num_reducers        number of reducers
 * @param n                   left dimension of the reducer grid
 * @param m                   right dimension of the reducer grid
 * @param stop_counter        optional counter used in repeat operation
 * @return a DataSet that contains the result of type {d}
 */
public final static DataSet groupByJoin(Tree left_join_key_fnc, // left join key function
        Tree right_join_key_fnc, // right join key function
        Tree left_groupby_fnc, // left group-by function
        Tree right_groupby_fnc, // right group-by function
        Tree accumulator_fnc, // accumulator function
        Tree zero, // the left zero of accumulator
        Tree reduce_fnc, // reduce function
        DataSet X, // left data set
        DataSet Y, // right data set
        int num_reducers, // number of reducers
        int n, int m, // dimensions of the reducer grid
        String stop_counter) // optional counter used in repeat operation
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.join.key.left", left_join_key_fnc.toString());
    conf.set("mrql.join.key.right", right_join_key_fnc.toString());
    conf.set("mrql.groupby.left", left_groupby_fnc.toString());
    conf.set("mrql.groupby.right", right_groupby_fnc.toString());
    conf.setInt("mrql.m", m);
    conf.setInt("mrql.n", n);
    conf.set("mrql.accumulator", accumulator_fnc.toString());
    conf.set("mrql.zero", zero.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(GroupByJoinKey.class);
    job.setJarByClass(GroupByJoinPlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(GroupByJoinPartitioner.class);
    job.setSortComparatorClass(GroupByJoinSortComparator.class);
    job.setGroupingComparatorClass(GroupByJoinGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    job.setReducerClass(JoinReducer.class);
    if (num_reducers > 0)
        job.setNumReduceTasks(num_reducers);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = false;
    return new DataSet(s, c, MapReducePlan.outputRecords(job));
}
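
The MRQL plan above pairs setSortComparatorClass with setPartitionerClass and setGroupingComparatorClass, which is the standard secondary-sort recipe: partition and group on the natural key, but sort on the full composite key so each reducer group receives its values in a known order. A generic sketch of that recipe follows; PairKey and its nested comparators are hypothetical illustration classes, not MRQL code.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class PairKey implements WritableComparable<PairKey> {
    public Text group = new Text(); // natural key
    public LongWritable order = new LongWritable(); // secondary sort field

    @Override
    public void write(DataOutput out) throws IOException {
        group.write(out);
        order.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        group.readFields(in);
        order.readFields(in);
    }

    @Override
    public int compareTo(PairKey o) {
        int c = group.compareTo(o.group);
        return c != 0 ? c : order.compareTo(o.order);
    }

    // Sort comparator: orders by (group, order), so values within a group arrive sorted.
    public static class SortComparator extends WritableComparator {
        public SortComparator() {
            super(PairKey.class, true); // falls back to PairKey.compareTo
        }
    }

    // Grouping comparator: compares the natural key only, so one reduce() call covers a whole group.
    public static class GroupComparator extends WritableComparator {
        public GroupComparator() {
            super(PairKey.class, true);
        }

        @Override
        @SuppressWarnings("rawtypes")
        public int compare(WritableComparable a, WritableComparable b) {
            return ((PairKey) a).group.compareTo(((PairKey) b).group);
        }
    }
}

In the driver, together with a partitioner that hashes only the natural key, this would be wired as job.setMapOutputKeyClass(PairKey.class), job.setSortComparatorClass(PairKey.SortComparator.class) and job.setGroupingComparatorClass(PairKey.GroupComparator.class).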

From source file:org.apache.mrql.JoinOperation.java

License:Apache License

/** The MapReduce2 physical operator (a reduce-side join)
 * @param mx             left mapper function
 * @param my             right mapper function
 * @param combine_fnc    optional in-mapper combiner function
 * @param reduce_fnc     reducer function
 * @param acc_fnc        optional accumulator function
 * @param zero           optional the zero value for the accumulator
 * @param X              left data set
 * @param Y              right data set
 * @param num_reduces    number of reducers
 * @param stop_counter   optional counter used in repeat operation
 * @param orderp         does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce2(Tree mx, // left mapper function
        Tree my, // right mapper function
        Tree combine_fnc, // optional in-mapper combiner function
        Tree reduce_fnc, // reducer function
        Tree acc_fnc, // optional accumulator function
        Tree zero, // optional the zero value for the accumulator
        DataSet X, // left data set
        DataSet Y, // right data set
        int num_reduces, // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp) // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper.left", mx.toString());
    conf.set("mrql.mapper.right", my.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) {
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(new DataSet[] { X, Y }, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setMapOutputKeyClass(JoinKey.class);
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerJoinPartitioner.class);
    job.setSortComparatorClass(MRContainerSortComparator.class);
    job.setGroupingComparatorClass(MRContainerGroupingComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    for (DataSource p : X.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperLeft.class);
    for (DataSource p : Y.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MapperRight.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce2_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce2 reducer");
    job.setReducerClass(JoinReducer.class);
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}

From source file:org.apache.mrql.MapReduceOperation.java

License:Apache License

/**
 * The MapReduce physical operator
 * @param map_fnc          the mapper function
 * @param combine_fnc      optional in-mapper combiner function
 * @param reduce_fnc       the reducer function
 * @param acc_fnc          optional accumulator function
 * @param zero             optional the zero value for the accumulator
 * @param source           the input data source
 * @param num_reduces      number of reducers
 * @param stop_counter     optional counter used in repeat operation
 * @param orderp           does the result need to be ordered?
 * @return a new data source that contains the result
 */
public final static DataSet mapReduce(Tree map_fnc, // mapper function
        Tree combine_fnc, // optional in-mapper combiner function
        Tree reduce_fnc, // reducer function
        Tree acc_fnc, // optional accumulator function
        Tree zero, // optional the zero value for the accumulator
        DataSet source, // input data source
        int num_reduces, // number of reducers
        String stop_counter, // optional counter used in repeat operation
        boolean orderp) // does the result need to be ordered?
        throws Exception {
    conf = MapReduceEvaluator.clear_configuration(conf);
    String newpath = new_path(conf);
    conf.set("mrql.mapper", map_fnc.toString());
    if (combine_fnc != null)
        conf.set("mrql.combiner", combine_fnc.toString());
    conf.set("mrql.reducer", reduce_fnc.toString());
    if (zero != null) { // will use in-mapper combiner
        conf.set("mrql.accumulator", acc_fnc.toString());
        conf.set("mrql.zero", zero.toString());
    } else
        conf.set("mrql.zero", "");
    conf.set("mrql.counter", stop_counter);
    setupSplits(source, conf);
    Job job = new Job(conf, newpath);
    distribute_compiled_arguments(job.getConfiguration());
    job.setJarByClass(MapReducePlan.class);
    job.setOutputKeyClass(MRContainer.class);
    job.setOutputValueClass(MRContainer.class);
    job.setPartitionerClass(MRContainerPartitioner.class);
    job.setSortComparatorClass(MRContainerKeyComparator.class);
    job.setGroupingComparatorClass(MRContainerKeyComparator.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    for (DataSource p : source.source)
        MultipleInputs.addInputPath(job, new Path(p.path),
                (Class<? extends MapReduceMRQLFileInputFormat>) p.inputFormat, MRMapper.class);
    FileOutputFormat.setOutputPath(job, new Path(newpath));
    job.setReducerClass(MRReducer.class);
    if (Config.trace && PlanGeneration.streamed_MapReduce_reducer(reduce_fnc))
        System.out.println("Streamed MapReduce reducer");
    if (num_reduces > 0)
        job.setNumReduceTasks(num_reduces);
    job.waitForCompletion(true);
    long c = (stop_counter.equals("-")) ? 0 : job.getCounters().findCounter("mrql", stop_counter).getValue();
    DataSource s = new BinaryDataSource(newpath, conf);
    s.to_be_merged = orderp;
    return new DataSet(s, c, outputRecords(job));
}

From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java

License:Apache License

private int runQueryCopy() throws Exception {
    log.info("Setting up Copy Tool with a query-based ruleset...");
    setup();
    if (!useCopyFileOutput) {
        createChildInstance(conf);
    }

    // Set up the configuration
    final AccumuloRdfConfiguration aconf = new AccumuloRdfConfiguration(conf);
    aconf.setBoolean(ConfigUtils.USE_MOCK_INSTANCE, mock);
    aconf.setTablePrefix(tablePrefix);
    aconf.setFlush(false);
    ConfigUtils.setIndexers(aconf);

    // Since we're copying at the statement-level, ignore any given list of tables and determine
    // which tables we might need to create based on which indexers are desired.
    final TablePrefixLayoutStrategy prefixStrategy = new TablePrefixLayoutStrategy(tablePrefix);
    tables.clear();
    // Always include core tables
    tables.add(prefixStrategy.getSpo());
    tables.add(prefixStrategy.getOsp());
    tables.add(prefixStrategy.getPo());
    // Copy namespaces if they exist
    tables.add(prefixStrategy.getNs());
    // Add tables associated with any configured indexers
    /* TODO: SEE RYA-160
    if (aconf.getBoolean(ConfigUtils.USE_FREETEXT, false)) {
    tables.add(ConfigUtils.getFreeTextDocTablename(conf));
    tables.add(ConfigUtils.getFreeTextTermTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_GEO, false)) {
    tables.add(ConfigUtils.getGeoTablename(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_TEMPORAL, false)) {
    tables.add(ConfigUtils.getTemporalTableName(conf));
    }
    if (aconf.getBoolean(ConfigUtils.USE_ENTITY, false)) {
    tables.add(ConfigUtils.getEntityTableName(conf));
    }
    */
    // Ignore anything else, e.g. statistics -- must be recalculated for the child if desired

    // Extract the ruleset, and copy the namespace table directly
    final AccumuloQueryRuleset ruleset = new AccumuloQueryRuleset(aconf);
    ruleset.addTable(prefixStrategy.getNs());
    for (final String line : ruleset.toString().split("\n")) {
        log.info(line);
    }

    // Create a Job and configure its input and output
    final Job job = Job.getInstance(aconf);
    job.setJarByClass(this.getClass());
    setupMultiTableInputFormat(job, ruleset);
    setupAccumuloOutput(job, "");

    if (useCopyFileOutput) {
        // Configure job for file output
        job.setJobName("Ruleset-based export to file: " + tablePrefix + " -> " + localBaseOutputDir);
        // Map (row) to (table+key, key+value)
        job.setMapperClass(RowRuleMapper.class);
        job.setMapOutputKeyClass(GroupedRow.class);
        job.setMapOutputValueClass(GroupedRow.class);
        // Group according to table and sort according to key
        job.setGroupingComparatorClass(GroupedRow.GroupComparator.class);
        job.setSortComparatorClass(GroupedRow.SortComparator.class);
        // Reduce ([table+row], rows): output each row to the file for that table, in sorted order
        job.setReducerClass(MultipleFileReducer.class);
        job.setOutputKeyClass(Key.class);
        job.setOutputValueClass(Value.class);
    } else {
        // Configure job for table output
        job.setJobName("Ruleset-based copy: " + tablePrefix + " -> " + childTablePrefix);
        // Map (row): convert to statement, insert to child (for namespace table, output row directly)
        job.setMapperClass(AccumuloRyaRuleMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Mutation.class);
        job.setNumReduceTasks(0);
        // Create the child tables, so mappers don't try to do this in parallel
        for (final String parentTable : tables) {
            final String childTable = parentTable.replaceFirst(tablePrefix, childTablePrefix);
            createTableIfNeeded(childTable);
        }
    }

    // Run the job and copy files to local filesystem if needed
    final Date beginTime = new Date();
    log.info("Job started: " + beginTime);
    final boolean success = job.waitForCompletion(true);
    if (success) {
        if (useCopyFileOutput) {
            log.info("Moving data from HDFS to the local file system");
            final Path baseOutputPath = new Path(baseOutputDir);
            for (final FileStatus status : FileSystem.get(conf).listStatus(baseOutputPath)) {
                if (status.isDirectory()) {
                    final String tableName = status.getPath().getName();
                    final Path hdfsPath = getPath(baseOutputDir, tableName);
                    final Path localPath = getPath(localBaseOutputDir, tableName);
                    log.info("HDFS directory: " + hdfsPath.toString());
                    log.info("Local directory: " + localPath.toString());
                    copyHdfsToLocal(hdfsPath, localPath);
                }
            }
        }
        final Date endTime = new Date();
        log.info("Job finished: " + endTime);
        log.info("The job took " + (endTime.getTime() - beginTime.getTime()) / 1000 + " seconds.");
        return 0;
    } else {
        log.error("Job failed!!!");
        return 1;
    }
}

From source file:org.apache.tinkerpop.gremlin.hadoop.process.computer.util.MapReduceHelper.java

License:Apache License

public static void executeMapReduceJob(final MapReduce mapReduce, final Memory.Admin memory,
        final Configuration configuration) throws IOException, ClassNotFoundException, InterruptedException {
    final Configuration newConfiguration = new Configuration(configuration);
    final boolean vertexProgramExists = newConfiguration.get(VertexProgram.VERTEX_PROGRAM, null) != null;
    if (vertexProgramExists) {
        newConfiguration.set(Constants.GREMLIN_HADOOP_GRAPH_READER, InputOutputHelper.getInputFormat(
                (Class) newConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class))
                .getCanonicalName());
        newConfiguration.unset(Constants.GREMLIN_HADOOP_GRAPH_FILTER);
    }
    final BaseConfiguration apacheConfiguration = new BaseConfiguration();
    apacheConfiguration.setDelimiterParsingDisabled(true);
    mapReduce.storeState(apacheConfiguration);
    ConfUtil.mergeApacheIntoHadoopConfiguration(apacheConfiguration, newConfiguration);

    final Optional<Comparator<?>> mapSort = mapReduce.getMapKeySort();
    final Optional<Comparator<?>> reduceSort = mapReduce.getReduceKeySort();
    newConfiguration.setClass(Constants.GREMLIN_HADOOP_MAP_REDUCE_CLASS, mapReduce.getClass(), MapReduce.class);
    final Job job = Job.getInstance(newConfiguration, mapReduce.toString());
    HadoopGraph.LOGGER.info(Constants.GREMLIN_HADOOP_JOB_PREFIX + mapReduce.toString());
    job.setJarByClass(HadoopGraph.class);
    if (mapSort.isPresent())
        job.setSortComparatorClass(ObjectWritableComparator.ObjectWritableMapComparator.class);
    job.setMapperClass(HadoopMap.class);
    if (mapReduce.doStage(MapReduce.Stage.REDUCE)) {
        if (mapReduce.doStage(MapReduce.Stage.COMBINE))
            job.setCombinerClass(HadoopCombine.class);
        job.setReducerClass(HadoopReduce.class);
    } else {
        if (mapSort.isPresent()) {
            job.setReducerClass(Reducer.class);
            job.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        } else {
            job.setNumReduceTasks(0);
        }
    }
    job.setMapOutputKeyClass(ObjectWritable.class);
    job.setMapOutputValueClass(ObjectWritable.class);
    job.setOutputKeyClass(ObjectWritable.class);
    job.setOutputValueClass(ObjectWritable.class);
    job.setInputFormatClass(GraphFilterInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    // if there is no vertex program, then grab the graph from the input location
    final Path graphPath;
    if (vertexProgramExists) {
        graphPath = new Path(
                Constants.getGraphLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION)));
    } else {
        graphPath = new Path(newConfiguration.get(Constants.GREMLIN_HADOOP_INPUT_LOCATION));
    }

    Path memoryPath = new Path(
            Constants.getMemoryLocation(newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION),
                    (reduceSort.isPresent() ? mapReduce.getMemoryKey() + "-temp" : mapReduce.getMemoryKey())));
    if (FileSystem.get(newConfiguration).exists(memoryPath)) {
        FileSystem.get(newConfiguration).delete(memoryPath, true);
    }
    FileInputFormat.setInputPaths(job, graphPath);
    FileOutputFormat.setOutputPath(job, memoryPath);
    job.waitForCompletion(true);

    // if there is a reduce sort, we need to run another identity MapReduce job
    if (reduceSort.isPresent()) {
        final Job reduceSortJob = Job.getInstance(newConfiguration, "ReduceKeySort");
        reduceSortJob.setSortComparatorClass(ObjectWritableComparator.ObjectWritableReduceComparator.class);
        reduceSortJob.setMapperClass(Mapper.class);
        reduceSortJob.setReducerClass(Reducer.class);
        reduceSortJob.setMapOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setMapOutputValueClass(ObjectWritable.class);
        reduceSortJob.setOutputKeyClass(ObjectWritable.class);
        reduceSortJob.setOutputValueClass(ObjectWritable.class);
        reduceSortJob.setInputFormatClass(SequenceFileInputFormat.class);
        reduceSortJob.setOutputFormatClass(SequenceFileOutputFormat.class);
        reduceSortJob.setNumReduceTasks(1); // todo: is this necessary to ensure sorted order?
        FileInputFormat.setInputPaths(reduceSortJob, memoryPath);
        final Path sortedMemoryPath = new Path(Constants.getMemoryLocation(
                newConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION), mapReduce.getMemoryKey()));
        FileOutputFormat.setOutputPath(reduceSortJob, sortedMemoryPath);
        reduceSortJob.waitForCompletion(true);
        FileSystem.get(newConfiguration).delete(memoryPath, true); // delete the temporary memory path
        memoryPath = sortedMemoryPath;
    }
    mapReduce.addResultToMemory(memory, new ObjectWritableIterator(newConfiguration, memoryPath));
}

From source file:org.clueweb.clueweb12.app.BuildDictionary.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("num").hasArg().withDescription("number of terms").create(COUNT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)
            || !cmdline.hasOption(COUNT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + ComputeTermStatistics.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    Configuration conf = getConf();

    conf.set(HADOOP_OUTPUT_OPTION, output);
    conf.setInt(HADOOP_TERMS_COUNT_OPTION, Integer.parseInt(cmdline.getOptionValue(COUNT_OPTION)));
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");

    Job job = new Job(conf, BuildDictionary.class.getSimpleName() + ":" + input);

    job.setJarByClass(BuildDictionary.class);
    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path(input));
    FileOutputFormat.setOutputPath(job, new Path(output));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(NullOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setSortComparatorClass(DictionaryTransformationStrategy.WritableComparator.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(MyReducer.class);

    FileSystem.get(getConf()).delete(new Path(output), true);
    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}