Example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job setSortComparatorClass.

Prototype

public void setSortComparatorClass(Class<? extends RawComparator> cls) throws IllegalStateException 

Document

Define the comparator that controls how the keys are sorted before they are passed to the Reducer.
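
When the natural ordering of the key class is not what the Reducer should receive, a custom RawComparator can be supplied here. The sketch below is illustrative only (DescendingLongComparator is a hypothetical name, not part of Hadoop); it extends WritableComparator and compares the serialized bytes directly, so LongWritable keys arrive at the reducer in descending order:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingLongComparator extends WritableComparator {
    protected DescendingLongComparator() {
        super(LongWritable.class);
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        long x = readLong(b1, s1); // WritableComparator helper: big-endian long at offset
        long y = readLong(b2, s2);
        return Long.compare(y, x); // swapped arguments yield descending order
    }
}

It would be registered with job.setSortComparatorClass(DescendingLongComparator.class) before submitting the job.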

Usage

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java

License:Open Source License

public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: ca.uwaterloo.iss4e.hadoop.pointperrow.ConsineMain <input> <output>");
        System.exit(2);
    }
    Job job1 = new Job(conf, "CosineMain");
    job1.setJarByClass(CosineMain.class);

    job1.setMapperClass(AggregateReadingsMapper.class);
    job1.setMapOutputKeyClass(LongWritable.class);
    job1.setMapOutputValueClass(DoubleWritable.class);

    job1.setReducerClass(AggregateReadingsReducer.class);
    job1.setOutputKeyClass(LongWritable.class);
    job1.setOutputValueClass(Text.class);
    FileInputFormat.setInputDirRecursive(job1, true);
    FileInputFormat.setInputPaths(job1, new Path(otherArgs[0]));
    int lastIdx = otherArgs[0].lastIndexOf("/");
    String tempOutput = otherArgs[0].substring(0, lastIdx) + "/temp";
    FileOutputFormat.setOutputPath(job1, new Path(tempOutput));

    System.out.println("\nStarting Job-1 ...");
    final long startTime = System.currentTimeMillis();
    try {
        final long startTimeJob1 = System.currentTimeMillis();
        if (!job1.waitForCompletion(true)) {
            System.out.println("Job-1 failed.");
        } else {
            System.out.println("Duration of Job1 " + ((System.currentTimeMillis() - startTimeJob1) / 1000.0)
                    + " seconds.");
            final Job job2 = new Job(conf, "CosineMain Aggregate");
            job2.setJarByClass(CosineMain.class);
            job2.setInputFormatClass(CartesianInputFormat.class);
            CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, tempOutput);
            CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, tempOutput);
            FileOutputFormat.setOutputPath(job2, new Path(otherArgs[1]));

            job2.setMapperClass(CartesianProductMapper.class);
            job2.setMapOutputKeyClass(DoubleWritable.class);
            job2.setMapOutputValueClass(Text.class);

            job2.setSortComparatorClass(DescendingKeyComparator.class);

            job2.setReducerClass(CartesianProductReducer.class);
            job2.setOutputKeyClass(Text.class);
            job2.setOutputValueClass(DoubleWritable.class);

            job2.setNumReduceTasks(10);
            final long startTimeJob2 = System.currentTimeMillis();
            System.out.println("\nStarting Job-2 ...");
            if (!job2.waitForCompletion(true)) {
                System.out.println("Job-2 failed.");
            } else {
                System.out.println("Duration of Job2: "
                        + ((System.currentTimeMillis() - startTimeJob2) / 1000.0) + " seconds.");
            }

        }
        FileSystem fs = FileSystem.get(conf);
        fs.delete(new Path(tempOutput), true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Total Duration: " + duration + " seconds.");
    }
    return 0;
}
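
The DescendingKeyComparator registered on job2 above is defined elsewhere in this project and is not shown on this page. A minimal sketch of what such a comparator might look like for the DoubleWritable map output keys (hypothetical implementation) is:

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class DescendingKeyComparator extends WritableComparator {
    protected DescendingKeyComparator() {
        super(DoubleWritable.class, true); // true: deserialize keys so the object compare() runs
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return b.compareTo(a); // reversed operands: largest cosine similarity first
    }
}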

From source file:ca.uwaterloo.iss4e.hadoop.pointperrow.CosineMain.java

License:Open Source License

public int run1(String[] args) throws IOException {
    if (args.length != 3) {
        System.err.println("Usage: java " + getClass().getName() + " <inputDir> <outDir> <ntasks>");
        ToolRunner.printGenericCommandUsage(System.err);
        return -1;
    }
    Configuration conf = getConf();
    final Job job2 = new Job(conf, "CosineMain cartesian product");
    job2.setJarByClass(CosineMain.class);

    job2.setInputFormatClass(CartesianInputFormat.class);
    CartesianInputFormat.setLeftInputInfo(job2, TextInputFormat.class, args[0]);
    CartesianInputFormat.setRightInputInfo(job2, TextInputFormat.class, args[0]);
    FileOutputFormat.setOutputPath(job2, new Path(args[1]));

    job2.setMapperClass(CartesianProductMapper.class);
    job2.setMapOutputKeyClass(DoubleWritable.class);
    job2.setMapOutputValueClass(Text.class);

    job2.setSortComparatorClass(DescendingKeyComparator.class);

    job2.setReducerClass(CartesianProductReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(DoubleWritable.class);
    job2.setNumReduceTasks(Integer.parseInt(args[2]));

    System.out.println("\nStarting Job-2 ...");
    final long startTime = System.currentTimeMillis();
    try {
        if (!job2.waitForCompletion(true)) {
            System.out.println("Job-2 failed.");
            System.exit(1);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    } finally {
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Duration is " + duration + " seconds.");
    }
    return 0;
}

From source file:com.asakusafw.runtime.mapreduce.simple.SimpleJobRunnerTest.java

License:Apache License

/**
 * Test for map-reduce job.
 * @throws Exception if failed
 */
@Test
public void map_reduce() throws Exception {
    Job job = newJob();
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(WordCountMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setSortComparatorClass(Text.Comparator.class);
    job.setGroupingComparatorClass(Text.Comparator.class);

    job.setReducerClass(WordCountReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    File inputDir = folder.newFolder();
    File inputFile = new File(inputDir, "input.txt");
    write(inputFile, new String[] { "a b c d", "a a b c", "c", });

    File outputDir = folder.newFolder();
    outputDir.delete();

    FileInputFormat.setInputPaths(job, new Path(inputFile.toURI()));
    FileOutputFormat.setOutputPath(job, new Path(outputDir.toURI()));
    assertThat(new SimpleJobRunner().run(job), is(true));
    assertThat(toMap(read(outputDir)), is(map(new String[] { "a", "3", "b", "2", "c", "3", "d", "1", })));
}
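
Note that the sort comparator and the grouping comparator are both set to Text.Comparator. For Text keys this matches the comparator Hadoop registers by default, so the calls reproduce standard shuffle semantics (each distinct word yields one reduce invocation); setting them explicitly presumably exercises the comparator configuration path of SimpleJobRunner.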

From source file:com.asakusafw.runtime.stage.AbstractStageClient.java

License:Apache License

@SuppressWarnings("rawtypes")
private void configureShuffle(Job job, VariableTable variables) {
    Class<? extends Reducer> reducer = getReducerClassOrNull();
    if (reducer != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Reducer: {0}", reducer.getName())); //$NON-NLS-1$
        }
        job.setReducerClass(reducer);
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Reducer: N/A"); //$NON-NLS-1$
        }
        job.setNumReduceTasks(0);
        return;
    }

    Class<? extends Writable> outputKeyClass = or(getShuffleKeyClassOrNull(), NullWritable.class);
    Class<? extends Writable> outputValueClass = or(getShuffleValueClassOrNull(), NullWritable.class);
    if (LOG.isDebugEnabled()) {
        LOG.debug(MessageFormat.format("Shuffle: key={0}, value={1}", //$NON-NLS-1$
                outputKeyClass.getName(), outputValueClass.getName()));
    }
    job.setMapOutputKeyClass(outputKeyClass);
    job.setMapOutputValueClass(outputValueClass);

    Class<? extends Reducer> combiner = getCombinerClassOrNull();
    if (combiner != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Combiner: {0}", combiner.getName())); //$NON-NLS-1$
        }
        job.setCombinerClass(combiner);
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Combiner: N/A"); //$NON-NLS-1$
        }
    }

    Class<? extends Partitioner> partitioner = getPartitionerClassOrNull();
    if (partitioner != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("Partitioner: {0}", partitioner.getName())); //$NON-NLS-1$
        }
        job.setPartitionerClass(partitioner);
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Partitioner: DEFAULT"); //$NON-NLS-1$
        }
    }

    Class<? extends RawComparator> groupingComparator = getGroupingComparatorClassOrNull();
    if (groupingComparator != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("GroupingComparator: {0}", groupingComparator.getName())); //$NON-NLS-1$
        }
        job.setGroupingComparatorClass(groupingComparator);
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("GroupingComparator: DEFAULT"); //$NON-NLS-1$
        }
    }

    Class<? extends RawComparator> sortComparator = getSortComparatorClassOrNull();
    if (sortComparator != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug(MessageFormat.format("SortComparator: {0}", sortComparator.getName())); //$NON-NLS-1$
        }
        job.setSortComparatorClass(sortComparator);
    } else {
        if (LOG.isDebugEnabled()) {
            LOG.debug("SortComparator: DEFAULT"); //$NON-NLS-1$
        }
    }
}

From source file:com.asakusafw.thundergate.runtime.cache.mapreduce.CacheBuildClient.java

License:Apache License

private void updateMerge() throws IOException, InterruptedException {
    Job job = newJob();

    List<StageInput> inputList = new ArrayList<>();
    inputList.add(new StageInput(storage.getHeadContents("*").toString(), TemporaryInputFormat.class,
            MergeJoinBaseMapper.class));
    inputList.add(new StageInput(storage.getPatchContents("*").toString(), TemporaryInputFormat.class,
            MergeJoinPatchMapper.class));
    StageInputDriver.set(job, inputList);
    job.setInputFormatClass(StageInputFormat.class);
    job.setMapperClass(StageInputMapper.class);
    job.setMapOutputKeyClass(PatchApplyKey.class);
    job.setMapOutputValueClass(modelClass);

    // combiner may have no effect in normal cases
    job.setReducerClass(MergeJoinReducer.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(modelClass);
    job.setPartitionerClass(PatchApplyKey.Partitioner.class);
    job.setSortComparatorClass(PatchApplyKey.SortComparator.class);
    job.setGroupingComparatorClass(PatchApplyKey.GroupComparator.class);

    TemporaryOutputFormat.setOutputPath(job, getNextDirectory());
    job.setOutputFormatClass(TemporaryOutputFormat.class);
    job.getConfiguration().setClass("mapred.output.committer.class", LegacyBridgeOutputCommitter.class,
            org.apache.hadoop.mapred.OutputCommitter.class);

    LOG.info(MessageFormat.format("applying patch (merge join): {0} / {1} -> {2}",
            storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
    try {
        boolean succeed = job.waitForCompletion(true);
        LOG.info(MessageFormat.format("applied patch (merge join): succeed={0}, {1} / {2} -> {3}", succeed,
                storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
        if (succeed == false) {
            throw new IOException(MessageFormat.format("failed to apply patch (merge join): {0} / {1} -> {2}",
                    storage.getPatchContents("*"), storage.getHeadContents("*"), getNextContents()));
        }
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
    putMeta();
}
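
The partitioner, sort comparator, and grouping comparator here are nested classes of PatchApplyKey, which is the standard secondary-sort wiring: presumably the partitioner and grouping comparator consider only the record-identifying portion of the key, while the sort comparator orders the full key, so each reduce call sees one record's head content and its patches in a deterministic application order.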

From source file:com.bark.hadoop.lab3.PageRank.java

@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    //        long timeStamp = new Date().getTime();
    try {
        /**
         * Job 1: Parse XML input and read title,links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);

        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);

        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);

        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path((args[1] + tmp + "/job1")));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }
    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();

        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);

        // specify a mapper
        job2.setMapperClass(AdjMapper.class);

        // specify a reducer
        job2.setReducerClass(AdjReducer.class);

        // specify output types
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job2, new Path((args[1] + tmp + "/job1")));
        job2.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job2, new Path((args[1] + tmp + "/job2")));
        job2.setOutputFormatClass(TextOutputFormat.class);

        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }
    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        /**
         * Change output separator to "=" instead of default \t for this job
         */
        conf3.set("mapreduce.output.textoutputformat.separator", "=");

        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);

        // specify a mapper
        job3.setMapperClass(PageCountMapper.class);

        // specify a reducer
        job3.setReducerClass(PageCountReducer.class);

        // specify output types
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job3, new Path((args[1] + tmp + "/job2")));
        job3.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job3, new Path((args[1] + tmp + "/job3")));
        job3.setOutputFormatClass(TextOutputFormat.class);

        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }
    /**
     * Job 4: PageRank
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf4.setInt("N", n);

            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);

            // specify a mapper
            job4.setMapperClass(PageRankMapper.class);

            // specify a reducer
            job4.setReducerClass(PageRankReducer.class);

            // specify output types
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);

            // specify input and output DIRECTORIES
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job2")));
            } else {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job4/" + (i - 1))));
            }
            job4.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job4, new Path((args[1] + tmp + "/job4/" + i)));
            job4.setOutputFormatClass(TextOutputFormat.class);
            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }
    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();

            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = Integer.parseInt(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf5.setInt("N", n);

            Job job5 = Job.getInstance(conf5);
            /**
             * one reducer only
             */
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);

            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);

            // specify a reducer
            job5.setReducerClass(SortReducer.class);

            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);

            // specify input and output DIRECTORIES
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path((args[1] + tmp + "/job4/" + y)));
            job5.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job5, new Path((args[1] + tmp + "/job5/" + y)));
            job5.setOutputFormatClass(TextOutputFormat.class);

            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }
    /**
     * Copy necessary output files to args[1]
     */

    /**
     * Rename and copy OutLinkGraph
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job2/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy total number of pages
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job3/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 1
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/1/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 8
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/8/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    return returnCode;
}

From source file:com.bigfishgames.biginsights.upsight.mapreduce.MapReduceAvroWordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    // AvroJob.setOutputKeySchema(job,
    //                         Pair.getPairSchema(Schema.create(Type.STRING),
    //                                           Schema.create(Type.NULL)));
    AvroJob.setOutputKeySchema(job, Event.getClassSchema());

    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MyAvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

    return 0;
}

From source file:com.bizosys.hsearch.kv.indexing.KVIndexer.java

License:Apache License

private static int runJob(int jobTypeI, Job job, FieldMapping fm, String input, String output,
        int scannerCacheSize, String filter) throws IOException, InterruptedException, ClassNotFoundException {

    int jobStatus = -1;

    switch (jobTypeI) {
    case SF2HB: {

        IdSearchLog.l.info("Starting Job for SF2HB input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);
        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case SF2HF: {

        //First creates map file and then convert to hfile.
        //create intermediate dir for map file output

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for SF2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("SF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS marker file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case SF2MF: {

        IdSearchLog.l.info("Starting Job for SF2MF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and output folder " + output);

        FileInputFormat.addInputPath(job, new Path(input));

        job.setMapperClass(KVMapperFile.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);

        job.setSortComparatorClass(TextPair.FirstComparator.class);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case MF2HB: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerHBase.class);
        TableMapReduceUtil.initTableReducerJob(fm.tableName, KVReducerHBase.class, job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case MF2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("MF2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS marker file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case MF2MF: {

        job.setMapperClass(KVMapperMapFile.class);
        job.setInputFormatClass(SequenceFileAsTextInputFormat.class);
        job.setMapOutputKeyClass(TextPair.class);
        job.setMapOutputValueClass(Text.class);
        SequenceFileAsTextInputFormat.addInputPath(job, new Path(input));

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case HB2HB: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());
        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        TableMapReduceUtil.initTableReducerJob(fm.tableName, // output table
                KVReducerHBase.class, // reducer class
                job);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;

    }
    case HB2HF: {

        String intermediateFolder = output + "_intermediate";
        Path intermediateOutputDir = new Path(intermediateFolder);

        IdSearchLog.l.info("Starting Job for HB2HF input field separator " + KVIndexer.FIELD_SEPARATOR
                + " using hbase table : " + fm.tableName + " and intremediate output folder "
                + intermediateFolder + " final output dir " + output);

        //reset the output folder to intermediate folder
        Configuration conf = job.getConfiguration();
        conf.set(OUTPUT_FOLDER, intermediateFolder);
        int jobT = JobTypeMapping.get("HB2MF");
        jobStatus = runJob(jobT, job, fm, input, intermediateFolder, scannerCacheSize, filter);

        if (jobStatus == 0) {

            Configuration hfileConf = HBaseConfiguration.create();
            hfileConf.set(XML_FILE_PATH, conf.get(XML_FILE_PATH));
            Job hfileJob = Job.getInstance(hfileConf, "Creating Hfile");
            String dataInputPath = intermediateFolder + "/" + MapFile.DATA_FILE_NAME;
            jobT = JobTypeMapping.get("IMF2HF");
            jobStatus = runJob(jobT, hfileJob, fm, dataInputPath, output, scannerCacheSize, filter);
        }

        //delete intermediate dir
        FileSystem.get(conf).delete(intermediateOutputDir, true);
        //delete the empty _SUCCESS marker file
        FileSystem.get(conf).delete(new Path(output, "_SUCCESS"), true);

        return jobStatus;
    }
    case HB2MF: {

        if (fm.tableName.equals(input)) {
            throw new IOException("Input table and index table can not be same");
        }

        Scan scan = new Scan();
        scan.setCaching(scannerCacheSize);
        scan.setCacheBlocks(false);
        scan.addFamily(fm.familyName.getBytes());

        if (null != filter) {
            if (filter.trim().length() > 0) {
                int index = filter.indexOf('=');
                scan.setFilter(new SingleColumnValueFilter(fm.familyName.getBytes(),
                        filter.substring(0, index).getBytes(), CompareOp.EQUAL,
                        filter.substring(index + 1).getBytes()));
            }
        }

        TableMapReduceUtil.initTableMapperJob(input, // input table
                scan, // Scan instance to control CF and attribute selection
                KVMapperHBase.class, // mapper class
                TextPair.class, // mapper output key
                Text.class, // mapper output value
                job);

        job.setReducerClass(KVReducerMapFile.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ImmutableBytesWritable.class);
        LazyOutputFormat.setOutputFormatClass(job, NullOutputFormat.class);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }
    case IMF2HF: {

        Path finalOutputDir = new Path(output);
        job.setJarByClass(KVIndexer.class);
        job.setMapperClass(KVMapperHFile.class);

        job.setInputFormatClass(SequenceFileInputFormat.class);
        SequenceFileInputFormat.addInputPath(job, new Path(input));
        FileOutputFormat.setOutputPath(job, finalOutputDir);

        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(KeyValue.class);

        HTable hTable = new HTable(job.getConfiguration(), fm.tableName);
        HFileOutputFormat.configureIncrementalLoad(job, hTable);

        jobStatus = job.waitForCompletion(true) ? 0 : 1;
        return jobStatus;
    }

    default:
        throw new IOException("Invalid Jobtype " + jobTypeI);
    }
}

From source file:com.ci.backports.avro.mapreduce.AvroJob.java

License:Apache License

/** Configure a job's map output key schema. */
public static void setMapOutputKeySchema(Job job, Schema schema) {
    job.setMapOutputKeyClass(AvroKey.class);
    job.getConfiguration().set(KEY_MAP_OUTPUT_SCHEMA_CONFIG_FIELD, schema.toString());
    job.setGroupingComparatorClass(AvroKeyComparator.class);
    job.setSortComparatorClass(AvroKeyComparator.class);
    addAvroSerialization(job.getConfiguration());
}
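
A hedged usage sketch of this helper, assuming the usual imports (org.apache.avro.Schema, org.apache.hadoop.conf.Configuration, org.apache.hadoop.mapreduce.Job); the job name and the string schema are arbitrary choices for illustration:

Job job = Job.getInstance(new Configuration(), "avro-shuffle");
AvroJob.setMapOutputKeySchema(job, Schema.create(Schema.Type.STRING));
// Map output keys are now AvroKey instances, and both sorting and grouping
// use AvroKeyComparator, which orders keys according to the schema.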

From source file:com.cloudera.avro.MapReduceAvroWordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    AvroJob.setOutputKeySchema(job, Pair.getPairSchema(Schema.create(Type.STRING), Schema.create(Type.INT)));
    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.waitForCompletion(true);

    return 0;
}