List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
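The examples below all come from real projects. As a quick orientation, here is a minimal, self-contained sketch (not taken from any of them) of the two common ways setNumReduceTasks is used with the classic org.apache.hadoop.mapred API: 0 for a map-only job, or a positive count together with a reducer class. The class name NumReduceTasksExample and the "in"/"out" paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(new Configuration(), NumReduceTasksExample.class);
        job.setJobName("setNumReduceTasks example");

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(IdentityMapper.class);

        // 0 reduce tasks: map-only job, the mapper output is written directly
        // by the output format; no shuffle/sort phase and no reducer runs.
        job.setNumReduceTasks(0);

        // Alternatively, request n reduce tasks (one output part file per reducer):
        // job.setNumReduceTasks(4);
        // job.setReducerClass(IdentityReducer.class);

        FileInputFormat.addInputPath(job, new Path("in"));
        FileOutputFormat.setOutputPath(job, new Path("out"));
        JobClient.runJob(job);
    }
}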
From source file:com.digitalpebble.behemoth.languageidentification.LanguageIdDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input file or directory");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("w", "overwrite", false, "overwrite the output");

    Path inputPath = null;
    Path outputPath = null;
    boolean overWrite = false;

    // parse the command line arguments
    CommandLine cmdLine = null;
    try {
        cmdLine = parser.parse(options, args);
        String input = cmdLine.getOptionValue("i");
        String output = cmdLine.getOptionValue("o");
        if (cmdLine.hasOption("help")) {
            formatter.printHelp("LanguageIdDriver", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("LanguageIdDriver", options);
            return -1;
        }
        inputPath = new Path(input);
        outputPath = new Path(output);
        if (cmdLine.hasOption("overwrite")) {
            overWrite = true;
        }
    } catch (ParseException e) {
        formatter.printHelp("LanguageIdDriver", options);
        return -1;
    }

    // check whether the output needs overwriting
    if (FileSystem.get(outputPath.toUri(), getConf()).exists(outputPath)) {
        if (!overWrite) {
            System.out.println("Output path " + outputPath + " already exists. Use option -w to overwrite.");
            return 0;
        } else
            fs.delete(outputPath, true);
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with Language Identifier");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(LanguageIdMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("LanguageIdDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        fs.delete(outputPath, true);
        return -1;
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper.java
License:Apache License
public int extract(Path input, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, input);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setNumReduceTasks(0);
    job.setMapperClass(ClusterDocIDDumper.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful())
        return -1;
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int vectorToString(Path vectorPath, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, vectorPath);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setNumReduceTasks(0);
    job.setMapperClass(Mahout2LibSVM.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful())
        return -1;
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int convert(Path vectorPath, Path labelPath, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, vectorPath);
    FileInputFormat.addInputPath(job, labelPath);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // single reducer
    job.setNumReduceTasks(1);
    job.setReducerClass(Mahout2LibSVM.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    boolean success = rj.isSuccessful();
    if (!success)
        return -1;

    if (log.isInfoEnabled()) {
        log.info("Conversion: done");
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); if (args.length != 2) { String syntax = "com.digitalpebble.solr.LucidWorksIndexerJob in solrURL"; System.err.println(syntax); return -1; }//from ww w. j av a2s . c o m Path inputPath = new Path(args[0]); String solrURL = args[1]; JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Indexing " + inputPath + " into LucidWorks"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(LucidWorksOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(IdentityMapper.class); // no reducer : send straight to SOLR at end of mapping job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); job.set("solr.server.url", solrURL); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error(e); } finally { fs.delete(tmp, true); } return 0; }
From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); if (args.length != 2) { String syntax = "com.digitalpebble.solr.SOLRIndexerJob in solrURL"; System.err.println(syntax); return -1; }// w w w . j ava 2s. c o m Path inputPath = new Path(args[0]); String solrURL = args[1]; JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Indexing " + inputPath + " into SOLR"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SOLROutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(IdentityMapper.class); // no reducer : send straight to SOLR at end of mapping job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); job.set("solr.server.url", solrURL); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error(e); } finally { fs.delete(tmp, true); } return 0; }
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)",
            true, false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());

        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && handlerName.equals("") == false) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired)
            job.setReducerClass(BehemothReducer.class);
        else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            return -1;
            // don't delete the output as some of it could be used
            // fs.delete(outputPath, true);
        }
    } catch (OptionException e) {
        log.error("OptionException", e.getMessage());
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.uima.UIMADriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the UIMA pear file has been stored on HDFS
    Path zap = new Path(pearPath);
    if (!fs.exists(zap)) {
        System.err.println("The UIMA application " + pearPath + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);

    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        fs.delete(outputPath, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    }

    return 0;
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////

        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(
                    new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load for every reducer evenly,
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries, i.e. objects selected from the
            // <code>samples</code> array; each block should have
            // about the same size.

            // find the last index of the element that equals samples[i], as
            // such an element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; // Util.findLowerBound(samples, samples[i], comparator);

            // the repeat count of samples[i]; if the key itself is too big,
            // select it as a boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times the average reducer size
            {
                // the current element is too big, greater than
                // two times the <code>avgReduceSize</code>,
                // put itself as a boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                // pw.println(samples[i]);

                // immediately put the next element on the boundary;
                // the next element starts at <code>upperBound+1</code>,
                // to prevent the current one from consuming even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    // pw.println(samples[upperBound+1]);

                    // move on to the next element after <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // together with the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as a boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    // pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, then the key space is too small and
        // TotalOrderPartitioner won't work: it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define boundaries, e.g., if
            // the sample size is two with elements [300, 1000], then
            // there should be 3 reducers: one handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
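The Mobius example above shrinks the reducer count to match the number of distinct boundary keys it managed to write, because TotalOrderPartitioner only works when the job has exactly one more reducer than there are distinct boundaries in the partition file. Below is a condensed sketch of that relationship, assuming a plain Text key and a hypothetical partition-file path (not taken from the Mobius source):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class PartitionFileSketch {
    static void writeBoundaries(JobConf job, List<Text> boundaries) throws IOException {
        FileSystem fs = FileSystem.get(job);
        Path partitionFile = new Path("_partitions.seq"); // placeholder location

        // write the sorted boundary keys, one per future partition split point
        SequenceFile.Writer writer =
                SequenceFile.createWriter(fs, job, partitionFile, Text.class, NullWritable.class);
        for (Text boundary : boundaries) {
            writer.append(boundary, NullWritable.get());
        }
        writer.close();

        // tell the partitioner where the boundaries live and keep the reducer
        // count consistent: one reducer per range between boundaries, plus one
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setNumReduceTasks(boundaries.size() + 1);
    }
}

Each boundary key splits the sorted key space into one more range, and each range is handled by exactly one reducer, which is why the example above ends with job.setNumReduceTasks(wroteSamples.size() + 1).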