Example usage for org.apache.hadoop.mapreduce Job setJarByClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setJarByClass.

Prototype

public void setJarByClass(Class<?> cls) 

Document

Set the Jar by finding where a given class came from.
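
Before the collected examples, here is a minimal, self-contained driver sketch showing the call in context. It is a hedged illustration: the class name SetJarByClassExample and the use of Hadoop's bundled TokenCounterMapper and IntSumReducer are choices made for this sketch, not taken from the examples below.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.map.TokenCounterMapper;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class SetJarByClassExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "setJarByClass example");
        // Ship the jar containing this driver class to the cluster; without
        // this, tasks can fail with ClassNotFoundException on remote nodes.
        job.setJarByClass(SetJarByClassExample.class);
        job.setMapperClass(TokenCounterMapper.class); // bundled with Hadoop
        job.setReducerClass(IntSumReducer.class);     // bundled with Hadoop
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}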

Usage

From source file:com.baidu.cloud.bmr.mapreduce.AccessLogAnalyzer.java

License:Open Source License

public static void main(String[] args) {
    Configuration conf = new Configuration();
    if (args.length != 2) {
        System.err.println("Usage: AccessLogAnalyzer <input path> <output path>");
        System.exit(-1);
    }
    String inputPath = args[0];
    String outputPath = args[1];
    try {
        Job job = new Job(conf, "AccessLogAnalyzer");
        job.setJarByClass(AccessLogAnalyzer.class);
        job.setMapperClass(AccessLogAnalyzerMapper.class);
        job.setReducerClass(AccessLogAnalyzerReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    } catch (IOException | ClassNotFoundException | InterruptedException e) {
        // Report the failure instead of swallowing it silently.
        e.printStackTrace();
        System.exit(1);
    }
}

From source file:com.bark.hadoop.lab3.PageRank.java

@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    try {
        /**
         * Job 1: Parse XML input and read title,links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);

        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);

        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);

        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path((args[1] + tmp + "/job1")));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }
    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();

        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);

        // specify a mapper
        job2.setMapperClass(AdjMapper.class);

        // specify a reducer
        job2.setReducerClass(AdjReducer.class);

        // specify output types
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job2, new Path((args[1] + tmp + "/job1")));
        job2.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job2, new Path((args[1] + tmp + "/job2")));
        job2.setOutputFormatClass(TextOutputFormat.class);

        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }
    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        /**
         * Change output separator to "=" instead of default \t for this job
         */
        conf3.set("mapreduce.output.textoutputformat.separator", "=");

        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);

        // specify a mapper
        job3.setMapperClass(PageCountMapper.class);

        // specify a reducer
        job3.setReducerClass(PageCountReducer.class);

        // specify output types
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job3, new Path((args[1] + tmp + "/job2")));
        job3.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job3, new Path((args[1] + tmp + "/job3")));
        job3.setOutputFormatClass(TextOutputFormat.class);

        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }
    /**
     * Job 4: PageRank
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    try (FSDataInputStream inputStream = fs.open(lfs.getPath());
                            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
                        String s;
                        while ((s = br.readLine()) != null) {
                            Matcher mt = pt.matcher(s);
                            if (mt.find()) {
                                n = Integer.parseInt(mt.group(1));
                                break;
                            }
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf4.setInt("N", n);

            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);

            // specify a mapper
            job4.setMapperClass(PageRankMapper.class);

            // specify a reducer
            job4.setReducerClass(PageRankReducer.class);

            // specify output types
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);

            // specify input and output DIRECTORIES
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job2")));
            } else {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job4/" + (i - 1))));
            }
            job4.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job4, new Path((args[1] + tmp + "/job4/" + i)));
            job4.setOutputFormatClass(TextOutputFormat.class);
            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }
    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();

            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    try (FSDataInputStream inputStream = fs.open(lfs.getPath());
                            BufferedReader br = new BufferedReader(new InputStreamReader(inputStream))) {
                        String s;
                        while ((s = br.readLine()) != null) {
                            Matcher mt = pt.matcher(s);
                            if (mt.find()) {
                                n = Integer.parseInt(mt.group(1));
                                break;
                            }
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf5.setInt("N", n);

            Job job5 = Job.getInstance(conf5);
            /**
             * one reducer only
             */
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);

            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);

            // specify a reducer
            job5.setReducerClass(SortReducer.class);

            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);

            // specify input and output DIRECTORIES
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path((args[1] + tmp + "/job4/" + y)));
            job5.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job5, new Path((args[1] + tmp + "/job5/" + y)));
            job5.setOutputFormatClass(TextOutputFormat.class);

            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }
    /**
     * Copy necessary output files to args[1]
     */

    /**
     * Rename and copy OutLinkGraph
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job2/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy total number of pages
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job3/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 1
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/1/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 8
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/8/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    return returnCode;
}

From source file:com.basho.riak.hadoop.RiakWordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    // Note: this keys array is populated but never used by the rest of the method.
    String[] keys = new String[10000];

    for (int i = 0; i < 10000; i++) {
        keys[i] = String.valueOf(i + 1000);
    }
    Configuration conf = getConf();
    conf = RiakConfig.setKeyLister(conf, new BucketKeyLister("wordcount"));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 11087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 12087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 13087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 14087));
    conf = RiakConfig.addLocation(conf, new RiakLocation("127.0.0.1", 15087));
    conf = RiakConfig.setOutputBucket(conf, "wordcount_out");
    conf = RiakConfig.setHadoopClusterSize(conf, 4);

    Job job = new Job(conf, "Riak-WordCount");

    job.setJarByClass(RiakWordCount.class);

    job.setInputFormatClass(RiakInputFormat.class);
    job.setMapperClass(TokenCounterMapper.class);

    job.setReducerClass(TokenCounterReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputFormatClass(RiakOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(WordCountResult.class);

    job.setNumReduceTasks(4);

    // waitForCompletion(true) below submits the job itself if it is still
    // unsubmitted, so this explicit submit() is redundant but harmless.
    job.submit();
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.baynote.kafka.hadoop.KafkaJobBuilder.java

License:Apache License

/**
 * Creates a {@link Job} based on how {@code this} {@link KafkaJobBuilder} has been configured. There are no
 * side-effects on {@code this} instance when you call this method, so you can call it multiple times.
 *
 * @param conf
 *            the job conf.
 * @return a fully configured {@link Job}.
 * @throws Exception
 *             if an error occurs while configuring the job.
 * @throws IllegalArgumentException
 *             if any required parameters are not set.
 */
public Job configureJob(final Configuration conf) throws Exception {
    validateSettings();
    final Job job = Job.getInstance(conf, getDefaultedJobName());

    // set queue inputs
    if (getQueueMappers().size() == 1) {
        job.setInputFormatClass(KafkaInputFormat.class);
        final TopicConf topicConf = Iterables.getOnlyElement(getQueueMappers());
        KafkaInputFormat.setTopic(job, topicConf.getTopic());
        KafkaInputFormat.setConsumerGroup(job, topicConf.getConsumerGroup());
        job.setMapperClass(topicConf.getMapper());
    } else {
        job.setInputFormatClass(MultipleKafkaInputFormat.class);
        for (final TopicConf topicConf : getQueueMappers()) {
            MultipleKafkaInputFormat.addTopic(job, topicConf.getTopic(), topicConf.getConsumerGroup(),
                    topicConf.getMapper());
        }
    }

    if (getMapOutputKeyClass() != null) {
        job.setMapOutputKeyClass(getMapOutputKeyClass());
    }

    if (getMapOutputValueClass() != null) {
        job.setMapOutputValueClass(getMapOutputValueClass());
    }

    if (getReducerClass() == null) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(getReducerClass());
        job.setNumReduceTasks(getNumReduceTasks());
    }

    if (getPartitionerClass() != null) {
        job.setPartitionerClass(getPartitionerClass());
    }

    // set output
    job.setOutputFormatClass(getOutputFormatClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());
    if (getOutputFormat() == SupportedOutputFormat.TEXT_FILE) {
        TextOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    } else if (getOutputFormat() == SupportedOutputFormat.SEQUENCE_FILE) {
        SequenceFileOutputFormat.setOutputPath(job, getDefaultedOutputPath());
    }

    if (usingS3()) {
        job.getConfiguration().set("fs.s3n.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3n.awsSecretAccessKey", getS3SecretyKey());
        job.getConfiguration().set("fs.s3.awsAccessKeyId", getS3AccessKey());
        job.getConfiguration().set("fs.s3.awsSecretAccessKey", getS3SecretyKey());
    }

    if (isLazyOutputFormat()) {
        LazyOutputFormat.setOutputFormatClass(job, getOutputFormatClass());
    }

    // setup kafka input format specifics
    KafkaInputFormat.setZkConnect(job, getZkConnect());
    KafkaInputFormat.setKafkaFetchSizeBytes(job, getKafkaFetchSizeBytes());

    job.setSpeculativeExecution(false);
    job.setJarByClass(getClass());

    // memory settings for mappers
    if (!Strings.isNullOrEmpty(getTaskMemorySettings())) {
        job.getConfiguration().set("mapred.child.java.opts", getTaskMemorySettings());
    }

    return job;
}
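
Because configureJob has no side effects on the builder, one configured builder can produce several independent jobs. The following is a hedged usage sketch: only configureJob(Configuration) is taken from the source above, and it assumes a KafkaJobBuilder that has already been configured elsewhere (its setter methods are not part of this excerpt).

// Minimal sketch, assuming an already-configured KafkaJobBuilder instance.
public static int runTwice(final KafkaJobBuilder builder) throws Exception {
    // Each call returns an independent, fully configured Job.
    final Job first = builder.configureJob(new Configuration());
    final Job second = builder.configureJob(new Configuration());
    final boolean ok = first.waitForCompletion(true) && second.waitForCompletion(true);
    return ok ? 0 : 1;
}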

From source file:com.benchmark.mapred.SecondarySort.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: secondarysrot <in> <out>");
        System.exit(2);/*from www . j  av a  2s .co m*/
    }
    Job job = new Job(conf, "secondary sort");
    job.setJarByClass(SecondarySort.class);
    job.setMapperClass(MapClass.class);
    job.setReducerClass(Reduce.class);

    // group and partition by the first int in the pair
    job.setPartitionerClass(FirstPartitioner.class);
    job.setGroupingComparatorClass(FirstGroupingComparator.class);

    // the map output is IntPair, IntWritable
    job.setMapOutputKeyClass(IntPair.class);
    job.setMapOutputValueClass(IntWritable.class);

    // the reduce output is Text, IntWritable
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.benchmark.mapred.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            System.err.println("Usage: wordcount <numReduces> <in> <out>");
            System.exit(2);
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            System.err.println("Usage: wordcount <numReduces> <in> <out>");
            System.exit(2);
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        System.err.println("Usage: wordcount <numReduces> <in> <out>");
        System.exit(2);
    }

    FileInputFormat.addInputPath(job, new Path(other_args.get(0)));
    FileOutputFormat.setOutputPath(job, new Path(other_args.get(1)));
    Date startIteration = new Date();
    boolean waitForCompletion = job.waitForCompletion(true);
    Date endIteration = new Date();
    System.out.println(
            "The iteration took " + (endIteration.getTime() - startIteration.getTime()) / 1000 + " seconds.");
    System.exit(waitForCompletion ? 0 : 1);
}

From source file:com.bigdog.hadoop.mapreduce.partition.KpiApp.java

public void kpi() throws Exception {
    final Job job = new Job(new Configuration(), KpiApp.class.getSimpleName());

    job.setJarByClass(KpiApp.class);

    //1.1 specify the input path
    FileInputFormat.setInputPaths(job, INPUT_PATH);
    // specify the input format class
    job.setInputFormatClass(TextInputFormat.class);

    //1.2Mapper
    job.setMapperClass(MyMapper.class);
    //<k2,v2>
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(KpiWritable.class);

    //1.3 specify the partitioner
    job.setPartitionerClass(KpiPartitioner.class);
    job.setNumReduceTasks(2);

    //1.4 TODO sorting and grouping
    //1.5 TODO combining
    //2.2 reduce
    job.setReducerClass(MyReducer.class);
    //<k3,v3>
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(KpiWritable.class);

    //2.3 specify the output path
    FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
    // specify the output format class
    job.setOutputFormatClass(TextOutputFormat.class);

    // submit the job to the JobTracker and wait for completion
    job.waitForCompletion(true);
}

From source file:com.bigfishgames.biginsights.upsight.mapreduce.MapReduceAvroWordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: AvroWordCount <input path> <output path>");
        return -1;
    }

    Job job = new Job(getConf());
    job.setJarByClass(MapReduceAvroWordCount.class);
    job.setJobName("wordcount");

    // We call setOutputSchema first so we can override the configuration
    // parameters it sets
    // AvroJob.setOutputKeySchema(job,
    //                         Pair.getPairSchema(Schema.create(Type.STRING),
    //                                           Schema.create(Type.NULL)));
    AvroJob.setOutputKeySchema(job, Event.getClassSchema());

    job.setOutputValueClass(NullWritable.class);

    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(MyAvroKeyOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setSortComparatorClass(Text.Comparator.class);

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Propagate the job's success or failure instead of always returning 0.
    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:com.binbo.wordcount.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class); // Set the combiner
    job.setPartitionerClass(WordPartitioner.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.bizosys.hsearch.kv.indexer.KVIndexer.java

License:Apache License

/**
 * Given indexing parameters, this starts an indexing job.
 * The different indexing types are:
 * SF2HB = Simple File (csv, tsv) to HBase directly.
 * SF2HF = Simple File (csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * SF2MF = Simple File (csv, tsv) to MapFile (key as {@link Text}, value as {@link BytesWritable}).
 * MF2HB = Map File (key and value as csv, tsv) to HBase.
 * MF2HF = Map File (key and value as csv, tsv) to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * MF2MF = Map File (key and value as csv, tsv) to MapFile (key as {@link Text}, value as {@link BytesWritable}).
 * HB2HB = HBase to HBase.
 * HB2HF = HBase to HFile, which can be loaded into HBase using the LoadIncrementalHFiles class from HBase.
 * HB2MF = HBase to MapFile (key as {@link Text}, value as {@link BytesWritable}).
 * @param args
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void execute(String[] args) throws IOException, InterruptedException, ClassNotFoundException {

    if (args.length < 7) {
        String err = "Usage : " + KVIndexer.class
                + " <<Job Type(SF2HB|SF2HF|SF2MF...)>> <<Input Source>> <<Output Sink>> <<XML File Configuration>> <<Skip Header(true|false)>> <<Run KeyGeneration Job>> <<Number Of reducer>> <<Speculative Execution>> <<scanner-cache-size>> <<filter>>";
        IdSearchLog.l.fatal(err);
        System.exit(1);
    }

    String msg = this.getClass().getName() + " > Initializing indexer job.";
    IdSearchLog.l.info(msg);

    int seq = 0;
    int len = args.length;

    String jobType = (len > seq) ? args[seq++] : "";
    String inputSource = (len > seq) ? args[seq++] : "";
    String outputSink = (len > seq) ? args[seq++] : "/tmp/hsearch-index";
    String xmlFilePath = (len > seq) ? args[seq++] : "";
    String skipHeader = (len > seq) ? args[seq++] : "false";
    boolean runKeyGenJob = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : false;
    int numberOfReducer = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 1;
    boolean speculativeExecution = (len > seq) ? args[seq++].trim().equalsIgnoreCase("true") : true;
    int scannerCacheSize = (len > seq) ? Integer.parseInt(args[seq++].trim()) : 300;
    String filter = (len > seq) ? args[seq++] : "";

    if (isEmpty(jobType)) {
        String err = this.getClass().getName()
                + " > Please enter Job type as one of these :\n SF2HB|SF2HF|SF2MF|MF2HB|MF2HF|MF2MF|HB2HB|HB2HF|HB2MF|IMF2HF";
        System.err.println(err);
        throw new IOException(err);
    }

    if (isEmpty(inputSource)) {
        String err = this.getClass().getName() + " > Please enter input file path.";
        System.err.println(err);
        throw new IOException(err);
    }

    Configuration conf = HBaseConfiguration.create();

    FieldMapping fm = createFieldMapping(conf, xmlFilePath, new StringBuilder());
    outputSink = outputSink.charAt(outputSink.length() - 1) == '/' ? outputSink : outputSink + "/";
    outputSink = outputSink + fm.tableName;

    createHBaseTable(fm);

    KVIndexer.FAM_NAME = fm.familyName.getBytes();
    KVIndexer.FIELD_SEPARATOR = fm.fieldSeparator;

    conf.set(XML_FILE_PATH, xmlFilePath);
    conf.set(OUTPUT_FOLDER, outputSink);
    conf.set(SKIP_HEADER, skipHeader);
    conf.set(RAW_FILE_SEPATATOR, String.valueOf(fm.fieldSeparator));

    Job job = Job.getInstance(conf, "com.bizosys.hsearch.kv.indexing.KVIndexer type : " + jobType + "\n"
            + inputSource + "\n" + outputSink);
    job.setJarByClass(this.getClass());
    job.setNumReduceTasks(numberOfReducer);

    Integer jobTypeI = JobTypeMapping.get(jobType);
    if (jobTypeI == null)
        throw new IOException("Invalid Jobtype " + jobType);

    /**
     * If an internal key index is given, generate the keys first and then do
     * the indexing; otherwise just run the indexer, creating keys from HBase.
     */
    boolean keyGenjobStatus = false;
    if (-1 != fm.internalKey && runKeyGenJob) {

        Configuration keyGenConf = HBaseConfiguration.create();
        keyGenConf.set(INPUT_SOURCE, inputSource);
        keyGenConf.set(XML_FILE_PATH, xmlFilePath);
        keyGenConf.set(OUTPUT_FOLDER, outputSink);
        keyGenConf.set(SKIP_HEADER, skipHeader);

        Job keyGenJob = Job.getInstance(keyGenConf, "Creating Keys KVKeyGenerator for " + inputSource);

        switch (jobTypeI) {
        case SF2HB:
        case SF2HF:
        case SF2MF: {

            FileInputFormat.addInputPath(keyGenJob, new Path(inputSource));

            keyGenJob.setMapperClass(KVKeyGeneratorMapperFile.class);
            keyGenJob.setInputFormatClass(TextInputFormat.class);
            keyGenJob.setMapOutputKeyClass(Text.class);
            keyGenJob.setMapOutputValueClass(Text.class);

            keyGenJob.setReducerClass(KVKeyGeneratorReducerFile.class);
            keyGenJob.setNumReduceTasks(numberOfReducer);
            keyGenJob.setOutputKeyClass(NullWritable.class);
            keyGenJob.setOutputValueClass(Text.class);

            inputSource = outputSink + "_" + INPUTWITH_KEY;
            Path intermediatePath = new Path(inputSource);
            System.out.println("Final input path " + inputSource);
            FileOutputFormat.setOutputPath(keyGenJob, intermediatePath);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }

            break;
        }
        case HB2HB:
        case HB2HF:
        case HB2MF: {

            Scan scan = new Scan();
            scan.setCaching(scannerCacheSize);
            scan.setCacheBlocks(false);

            byte[] family = fm.familyName.getBytes();
            for (String name : fm.nameWithField.keySet()) {

                Field fld = fm.nameWithField.get(name);
                if (!fld.isMergedKey)
                    continue;
                scan.addColumn(family, fld.sourceName.trim().getBytes());
            }

            TableMapReduceUtil.initTableMapperJob(inputSource, // input table
                    scan, // Scan instance to control CF and attribute selection
                    KVKeyGeneratorMapperHBase.class, // mapper class
                    Text.class, // mapper output key
                    ImmutableBytesWritable.class, // mapper output value
                    keyGenJob);

            TableMapReduceUtil.initTableReducerJob(inputSource, // output table
                    KVKeyGeneratorReducerHBase.class, // reducer class
                    keyGenJob);

            keyGenjobStatus = keyGenJob.waitForCompletion(true);
            if (!keyGenjobStatus) {
                throw new IOException("Error in running Job for Key Generation");
            }
            break;
        }
        default:
            break;
        }
    }
    /*
     * Run job based on job type eg. SF2HB,SF2MF,SF2HF etc.
     */
    System.out.println("Sending path " + inputSource);
    runJob(jobTypeI, job, fm, inputSource, outputSink, scannerCacheSize, filter);
}
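
A hedged invocation sketch for execute: every argument value below (paths, file names, flags) is a hypothetical placeholder chosen only to match the positional usage string above, and a public no-argument constructor is assumed.

// Illustrative only: all paths and values are hypothetical placeholders.
String[] indexerArgs = {
        "SF2HB",                    // job type: simple file to HBase
        "/data/input/records.tsv",  // input source
        "/tmp/hsearch-index",       // output sink
        "/conf/field-mapping.xml",  // XML field-mapping configuration
        "true",                     // skip header row
        "false",                    // run the key-generation job
        "4"                         // number of reducers
};
new KVIndexer().execute(indexerArgs);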