List of usage examples for org.apache.hadoop.mapred JobConf setNumReduceTasks
public void setNumReduceTasks(int n)
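The examples below all come from real projects. As a quick orientation, here is a minimal, self-contained sketch (not taken from any of them) of the two common ways setNumReduceTasks is used with the classic org.apache.hadoop.mapred API: 0 for a map-only job, or a positive count together with a reducer class. The class name NumReduceTasksExample and the "in"/"out" paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;

public class NumReduceTasksExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(new Configuration(), NumReduceTasksExample.class);
        job.setJobName("setNumReduceTasks example");

        job.setInputFormat(TextInputFormat.class);
        job.setOutputFormat(TextOutputFormat.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        job.setMapperClass(IdentityMapper.class);

        // 0 reduce tasks: map-only job, the mapper output is written directly
        // by the output format; no shuffle/sort phase and no reducer runs.
        job.setNumReduceTasks(0);

        // Alternatively, request n reduce tasks (one output part file per reducer):
        // job.setNumReduceTasks(4);
        // job.setReducerClass(IdentityReducer.class);

        FileInputFormat.addInputPath(job, new Path("in"));
        FileOutputFormat.setOutputPath(job, new Path("out"));
        JobClient.runJob(job);
    }
}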
From source file:com.digitalpebble.behemoth.languageidentification.LanguageIdDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input file or directory");
    options.addOption("o", "output", true, "output Behemoth corpus");
    options.addOption("w", "overwrite", false, "overwrite the output");

    Path inputPath = null;
    Path outputPath = null;
    boolean overWrite = false;

    // parse the command line arguments
    CommandLine cmdLine = null;
    try {
        cmdLine = parser.parse(options, args);
        String input = cmdLine.getOptionValue("i");
        String output = cmdLine.getOptionValue("o");
        if (cmdLine.hasOption("help")) {
            formatter.printHelp("LanguageIdDriver", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("LanguageIdDriver", options);
            return -1;
        }
        inputPath = new Path(input);
        outputPath = new Path(output);
        if (cmdLine.hasOption("overwrite")) {
            overWrite = true;
        }
    } catch (ParseException e) {
        formatter.printHelp("LanguageIdDriver", options);
        return -1;
    }

    // check whether the output needs overwriting
    if (FileSystem.get(outputPath.toUri(), getConf()).exists(outputPath)) {
        if (!overWrite) {
            System.out.println("Output path " + outputPath + " already exists. Use option -w to overwrite.");
            return 0;
        } else
            fs.delete(outputPath, true);
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with Language Identifier");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(LanguageIdMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (log.isInfoEnabled()) {
            log.info("LanguageIdDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        log.error(e.getMessage(), e);
        fs.delete(outputPath, true);
        return -1;
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.ClusterDocIDDumper.java
License:Apache License
public int extract(Path input, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, input);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setNumReduceTasks(0);
    job.setMapperClass(ClusterDocIDDumper.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful())
        return -1;
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int vectorToString(Path vectorPath, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, vectorPath);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setNumReduceTasks(0);
    job.setMapperClass(Mahout2LibSVM.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful())
        return -1;
    return 0;
}
From source file:com.digitalpebble.behemoth.mahout.util.Mahout2LibSVM.java
License:Apache License
public int convert(Path vectorPath, Path labelPath, Path output) throws IOException {
    JobConf job = new JobConf(getConf());
    // job.setJobName(this.getClass().getName());
    job.setJarByClass(this.getClass());

    FileInputFormat.addInputPath(job, vectorPath);
    FileInputFormat.addInputPath(job, labelPath);
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(IdentityMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    // single reducer
    job.setNumReduceTasks(1);
    job.setReducerClass(Mahout2LibSVM.class);

    FileOutputFormat.setOutputPath(job, output);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    RunningJob rj = JobClient.runJob(job);
    boolean success = rj.isSuccessful();
    if (!success)
        return -1;

    if (log.isInfoEnabled()) {
        log.info("Conversion: done");
    }
    return 0;
}
From source file:com.digitalpebble.behemoth.solr.LucidWorksIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); if (args.length != 2) { String syntax = "com.digitalpebble.solr.LucidWorksIndexerJob in solrURL"; System.err.println(syntax); return -1; }//from ww w. j av a2s . c o m Path inputPath = new Path(args[0]); String solrURL = args[1]; JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Indexing " + inputPath + " into LucidWorks"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(LucidWorksOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(IdentityMapper.class); // no reducer : send straight to SOLR at end of mapping job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); job.set("solr.server.url", solrURL); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("LucidWorksIndexerJob completed. Time " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error(e); } finally { fs.delete(tmp, true); } return 0; }
From source file:com.digitalpebble.behemoth.solr.SOLRIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception { final FileSystem fs = FileSystem.get(getConf()); if (args.length != 2) { String syntax = "com.digitalpebble.solr.SOLRIndexerJob in solrURL"; System.err.println(syntax); return -1; }// w w w . j ava 2s. c o m Path inputPath = new Path(args[0]); String solrURL = args[1]; JobConf job = new JobConf(getConf()); job.setJarByClass(this.getClass()); job.setJobName("Indexing " + inputPath + " into SOLR"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SOLROutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(IdentityMapper.class); // no reducer : send straight to SOLR at end of mapping job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); job.set("solr.server.url", solrURL); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("SOLRIndexerJob completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error(e); } finally { fs.delete(tmp, true); } return 0; }
From source file:com.digitalpebble.behemoth.tika.TikaDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    GroupBuilder gBuilder = new GroupBuilder().withName("Options:");
    List<Option> options = new ArrayList<Option>();
    Option inputOpt = buildOption("input", "i", "The input path", true, true, null);
    options.add(inputOpt);
    Option outOpt = buildOption("output", "o", "The output path", true, true, null);
    options.add(outOpt);
    Option tikaOpt = buildOption("tikaProcessor", "t",
            "The fully qualified name of a TikaProcessor class that handles the extraction (optional)",
            true, false, null);
    options.add(tikaOpt);
    Option mimeTypeOpt = buildOption("mimeType", "m", "The mime type to use (optional)", true, false, "");
    options.add(mimeTypeOpt);

    for (Option opt : options) {
        gBuilder = gBuilder.withOption(opt);
    }

    Group group = gBuilder.create();

    try {
        Parser parser = new Parser();
        parser.setGroup(group);
        // TODO catch exceptions with parsing of opts
        CommandLine cmdLine = parser.parse(args);
        Path inputPath = new Path(cmdLine.getValue(inputOpt).toString());
        Path outputPath = new Path(cmdLine.getValue(outOpt).toString());

        String handlerName = null;
        if (cmdLine.hasOption(tikaOpt)) {
            handlerName = cmdLine.getValue(tikaOpt).toString();
        }

        JobConf job = new JobConf(getConf());
        job.setJarByClass(this.getClass());

        if (cmdLine.hasOption(mimeTypeOpt)) {
            String mimeType = cmdLine.getValue(mimeTypeOpt).toString();
            job.set(TikaConstants.TIKA_MIME_TYPE_KEY, mimeType);
        }

        if (handlerName != null && handlerName.equals("") == false) {
            job.set(TIKA_PROCESSOR_KEY, handlerName);
        }

        job.setJobName("Tika : " + inputPath.toString());

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BehemothDocument.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BehemothDocument.class);

        job.setMapperClass(TikaMapper.class);

        boolean isFilterRequired = BehemothReducer.isRequired(job);
        if (isFilterRequired)
            job.setReducerClass(BehemothReducer.class);
        else {
            job.setNumReduceTasks(0);
        }

        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        try {
            long start = System.currentTimeMillis();
            JobClient.runJob(job);
            long finish = System.currentTimeMillis();
            if (log.isInfoEnabled()) {
                log.info("TikaDriver completed. Timing: " + (finish - start) + " ms");
            }
        } catch (Exception e) {
            log.error("Exception", e);
            return -1;
            // don't delete the output as some of it could be used
            // fs.delete(outputPath, true);
        }
    } catch (OptionException e) {
        log.error("OptionException", e.getMessage());
        HelpFormatter formatter = new HelpFormatter();
        formatter.setGroup(group);
        formatter.print();
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.uima.UIMADriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length != 3) {
        String syntax = "com.digitalpebble.behemoth.uima.UIMADriver in out path_pear_file";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String pearPath = args[2];

    // check that the UIMA pear file has been stored on HDFS
    Path zap = new Path(pearPath);
    if (!fs.exists(zap)) {
        System.err.println("The UIMA application " + pearPath + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());
    job.setJobName("Processing with UIMA application : " + pearPath);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    job.setMapperClass(UIMAMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the UIMA pear onto the DistributedCache
    DistributedCache.addCacheFile(new URI(pearPath), job);

    job.set("uima.pear.path", pearPath);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("UIMADriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception", e);
        fs.delete(outputPath, true);
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.util.CorpusFilter.java
License:Apache License
public int run(String[] args) throws Exception {
    Options options = new Options();
    // automatically generate the help statement
    HelpFormatter formatter = new HelpFormatter();
    // create the parser
    CommandLineParser parser = new GnuParser();

    options.addOption("h", "help", false, "print this message");
    options.addOption("i", "input", true, "input Behemoth corpus");
    options.addOption("o", "output", true, "output Behemoth corpus");

    // parse the command line arguments
    CommandLine line = null;
    try {
        line = parser.parse(options, args);
        String input = line.getOptionValue("i");
        String output = line.getOptionValue("o");
        if (line.hasOption("help")) {
            formatter.printHelp("CorpusFilter", options);
            return 0;
        }
        if (input == null || output == null) {
            formatter.printHelp("CorpusFilter", options);
            return -1;
        }
    } catch (ParseException e) {
        formatter.printHelp("CorpusFilter", options);
        return -1;
    }

    final FileSystem fs = FileSystem.get(getConf());

    Path inputPath = new Path(line.getOptionValue("i"));
    Path outputPath = new Path(line.getOptionValue("o"));

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("CorpusFilter : " + inputPath.toString());

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(BehemothDocument.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(BehemothDocument.class);

    boolean isFilterRequired = BehemothMapper.isRequired(job);
    // should be the case here
    if (!isFilterRequired) {
        System.err.println("No filters configured. Check your behemoth-site.xml");
        return -1;
    }
    job.setMapperClass(BehemothMapper.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    try {
        JobClient.runJob(job);
    } catch (Exception e) {
        e.printStackTrace();
        fs.delete(outputPath, true);
    }

    return 0;
}
From source file:com.ebay.erl.mobius.core.mapred.ConfigurableJob.java
License:Apache License
private static void writePartitionFile(JobConf job, Sampler sampler) {
    try {
        ////////////////////////////////////////////////
        // first, getting samples from the data sources
        ////////////////////////////////////////////////

        LOGGER.info("Running local sampling for job [" + job.getJobName() + "]");
        InputFormat inf = job.getInputFormat();
        Object[] samples = sampler.getSample(inf, job);
        LOGGER.info("Samples retrieved, sorting...");

        ////////////////////////////////////////////////
        // sort the samples
        ////////////////////////////////////////////////
        RawComparator comparator = job.getOutputKeyComparator();
        Arrays.sort(samples, comparator);

        if (job.getBoolean("mobius.print.sample", false)) {
            PrintWriter pw = new PrintWriter(new OutputStreamWriter(new GZIPOutputStream(
                    new BufferedOutputStream(new FileOutputStream(
                            new File(job.get("mobius.sample.file", "./samples.txt.gz")))))));
            for (Object obj : samples) {
                pw.println(obj);
            }
            pw.flush();
            pw.close();
        }

        ////////////////////////////////////////////////
        // start to write partition files
        ////////////////////////////////////////////////

        FileSystem fs = FileSystem.get(job);
        Path partitionFile = fs.makeQualified(new Path(TotalOrderPartitioner.getPartitionFile(job)));
        while (fs.exists(partitionFile)) {
            partitionFile = new Path(partitionFile.toString() + "." + System.currentTimeMillis());
        }
        fs.deleteOnExit(partitionFile);
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        LOGGER.info("write partition file to:" + partitionFile.toString());

        int reducersNbr = job.getNumReduceTasks();
        Set<Object> wroteSamples = new HashSet<Object>();

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, job, partitionFile, Tuple.class,
                NullWritable.class);

        float avgReduceSize = samples.length / reducersNbr;

        int lastBegin = 0;
        for (int i = 0; i < samples.length;) {
            // trying to distribute the load for every reducer evenly,
            // dividing the <code>samples</code> into a set of blocks
            // separated by boundaries, i.e. objects selected from the
            // <code>samples</code> array; each block should have
            // about the same size.

            // find the last index of the element that equals samples[i], as
            // such an element might appear multiple times in the samples.
            int upperBound = Util.findUpperBound(samples, samples[i], comparator);

            int lowerBound = i; // Util.findLowerBound(samples, samples[i], comparator);

            // the repeat count of samples[i]; if the key itself is too big,
            // select it as a boundary
            int currentElemSize = upperBound - lowerBound + 1;

            if (currentElemSize > avgReduceSize * 2) // greater than two times the average reducer size
            {
                // the current element is too big, greater than
                // two times the <code>avgReduceSize</code>,
                // put itself as a boundary
                writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                // pw.println(samples[i]);

                // immediately put the next element on the boundary;
                // the next element starts at <code>upperBound+1</code>,
                // to prevent the current one from consuming even more.
                if (upperBound + 1 < samples.length) {
                    writer.append(((DataJoinKey) samples[upperBound + 1]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[upperBound + 1]).getKey());
                    // pw.println(samples[upperBound+1]);

                    // move on to the next element after <code>samples[upperBound+1]</code>
                    lastBegin = Util.findUpperBound(samples, samples[upperBound + 1], comparator) + 1;
                    i = lastBegin;
                } else {
                    break;
                }
            } else {
                // the current element is small enough to be considered
                // together with the previous group
                int size = upperBound - lastBegin;
                if (size > avgReduceSize) {
                    // by including the current elements, we have
                    // found a block that's big enough, select it
                    // as a boundary
                    writer.append(((DataJoinKey) samples[i]).getKey(), NullWritable.get());
                    wroteSamples.add(((DataJoinKey) samples[i]).getKey());
                    // pw.println(samples[i]);

                    i = upperBound + 1;
                    lastBegin = i;
                } else {
                    i = upperBound + 1;
                }
            }
        }

        writer.close();

        // if the number of written samples doesn't equal the number of
        // reducers minus one, then the key space is too small and
        // TotalOrderPartitioner won't work: it works only if
        // the partition boundaries are distinct.
        //
        // we need to change the number of reducers
        if (wroteSamples.size() + 1 != reducersNbr) {
            LOGGER.info("Write complete, but key space is too small, sample size=" + wroteSamples.size()
                    + ", reducer size:" + (reducersNbr));
            LOGGER.info("Set the reducer size to:" + (wroteSamples.size() + 1));

            // add 1 because the written samples define boundaries, e.g., if
            // the sample size is two with elements [300, 1000], then
            // there should be 3 reducers: one handling i<300, one
            // for 300<=i<1000, and another one for 1000<=i
            job.setNumReduceTasks((wroteSamples.size() + 1));
        }

        samples = null;
    } catch (IOException e) {
        LOGGER.error(e.getMessage(), e);
        throw new RuntimeException(e);
    }
}
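The Mobius example above shrinks the reducer count to match the number of distinct boundary keys it managed to write, because TotalOrderPartitioner only works when the job has exactly one more reducer than there are distinct boundaries in the partition file. Below is a condensed sketch of that relationship, assuming a plain Text key and a hypothetical partition-file path (not taken from the Mobius source):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class PartitionFileSketch {
    static void writeBoundaries(JobConf job, List<Text> boundaries) throws IOException {
        FileSystem fs = FileSystem.get(job);
        Path partitionFile = new Path("_partitions.seq"); // placeholder location

        // write the sorted boundary keys, one per future partition split point
        SequenceFile.Writer writer =
                SequenceFile.createWriter(fs, job, partitionFile, Text.class, NullWritable.class);
        for (Text boundary : boundaries) {
            writer.append(boundary, NullWritable.get());
        }
        writer.close();

        // tell the partitioner where the boundaries live and keep the reducer
        // count consistent: one reducer per range between boundaries, plus one
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setNumReduceTasks(boundaries.size() + 1);
    }
}

Each boundary key splits the sorted key space into one more range, and each range is handled by exactly one reducer, which is why the example above ends with job.setNumReduceTasks(wroteSamples.size() + 1).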