List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
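Before the per-project examples below, here is a minimal, self-contained sketch of the loadDefaults overload itself (the examples that follow mostly use other JobConf constructors). Passing false means the instance does not load the default Hadoop resource files, so only values set explicitly or added via addResource are visible; the property name "my.app.setting" and the resource path used here are placeholders for illustration, not part of any real deployment.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsExample {
    public static void main(String[] args) {
        // Create a JobConf that does NOT load the default resources
        // (core-default.xml, core-site.xml, mapred-site.xml, ...).
        JobConf conf = new JobConf(false);

        // Only explicitly set values are present on this instance.
        conf.set("my.app.setting", "value"); // hypothetical property name
        conf.setJobName("load-defaults-example");

        // Configuration resources can still be added by hand if needed.
        conf.addResource(new Path("/etc/hadoop/conf/custom-site.xml")); // hypothetical path

        System.out.println("my.app.setting = " + conf.get("my.app.setting"));
        System.out.println("job name = " + conf.getJobName());
    }
}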
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@Override
public void beginWindow(long windowId) {
    if (!emitPartitioningCountOnce) {
        outputCount.emit(new KeyHashValPair<Integer, Integer>(operatorId, 1));
        emitPartitioningCountOnce = true;
    }
    if (reader == null) {
        try {
            reader = inputFormat.getRecordReader(inputSplit, new JobConf(new Configuration()), reporter);
        } catch (IOException e) {
            logger.info("error getting record reader {}", e.getMessage());
        }
    }
    super.beginWindow(windowId);
}
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@Override
public void setup(OperatorContext context) {
    if (context != null) {
        operatorId = context.getId();
    }
    reporter = new ReporterImpl(ReporterType.Mapper, new Counters());
    outputCollector = new OutputCollectorImpl<K2, V2>();
    Configuration conf = new Configuration();
    try {
        inputFormat = inputFormatClass.newInstance();
        SerializationFactory serializationFactory = new SerializationFactory(conf);
        Deserializer keyDeserializer = serializationFactory.getDeserializer(inputSplitClass);
        keyDeserializer.open(new ByteArrayInputStream(outstream.toByteArray()));
        inputSplit = (InputSplit) keyDeserializer.deserialize(null);
        ((ReporterImpl) reporter).setInputSplit(inputSplit);
        reader = inputFormat.getRecordReader(inputSplit, new JobConf(conf), reporter);
    } catch (Exception e) {
        logger.info("failed to initialize inputformat obj {}", inputFormat);
        throw new RuntimeException(e);
    }
    InputStream stream = null;
    if (configFile != null && configFile.length() > 0) {
        stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
        if (stream == null) {
            stream = ClassLoader.getSystemResourceAsStream(configFile);
        }
    }
    if (stream != null) {
        conf.addResource(stream);
    }
    jobConf = new JobConf(conf);
    if (mapClass != null) {
        try {
            mapObject = mapClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }
        mapObject.configure(jobConf);
    }
    if (combineClass != null) {
        try {
            combineObject = combineClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
        }
        combineObject.configure(jobConf);
    }
}
From source file:com.datatorrent.demos.mroperator.MapOperator.java
License:Open Source License
@SuppressWarnings("rawtypes") @Override/* ww w .j a va 2 s .co m*/ public Collection<Partition<MapOperator<K1, V1, K2, V2>>> definePartitions( Collection<Partition<MapOperator<K1, V1, K2, V2>>> partitions, int incrementalCapacity) { Collection c = partitions; Collection<Partition<MapOperator<K1, V1, K2, V2>>> operatorPartitions = c; Partition<MapOperator<K1, V1, K2, V2>> template = null; Iterator<Partition<MapOperator<K1, V1, K2, V2>>> itr = operatorPartitions.iterator(); template = itr.next(); Configuration conf = new Configuration(); SerializationFactory serializationFactory = new SerializationFactory(conf); if (outstream.size() == 0) { InputSplit[] splits; try { splits = getSplits(new JobConf(conf), incrementalCapacity + 1, template.getPartitionedInstance().getDirName()); } catch (Exception e1) { logger.info(" can't get splits {}", e1.getMessage()); throw new RuntimeException(e1); } Collection<Partition<MapOperator<K1, V1, K2, V2>>> operList = new ArrayList<Partition<MapOperator<K1, V1, K2, V2>>>(); itr = operatorPartitions.iterator(); int size = splits.length; Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass()); while (size > 0 && itr.hasNext()) { Partition<MapOperator<K1, V1, K2, V2>> p = itr.next(); MapOperator<K1, V1, K2, V2> opr = p.getPartitionedInstance(); opr.setInputFormatClass(inputFormatClass); opr.setMapClass(mapClass); opr.setCombineClass(combineClass); opr.setConfigFile(configFile); try { keySerializer.open(opr.getOutstream()); keySerializer.serialize(splits[size - 1]); opr.setInputSplitClass(splits[size - 1].getClass()); } catch (IOException e) { logger.info("error while serializing {}", e.getMessage()); } size--; operList.add(p); } while (size > 0) { MapOperator<K1, V1, K2, V2> opr = new MapOperator<K1, V1, K2, V2>(); opr.setInputFormatClass(inputFormatClass); opr.setMapClass(mapClass); opr.setCombineClass(combineClass); opr.setConfigFile(configFile); try { keySerializer.open(opr.getOutstream()); keySerializer.serialize(splits[size - 1]); opr.setInputSplitClass(splits[size - 1].getClass()); } catch (IOException e) { logger.info("error while serializing {}", e.getMessage()); } size--; operList.add(new DefaultPartition<MapOperator<K1, V1, K2, V2>>(opr)); } try { keySerializer.close(); } catch (IOException e) { throw new RuntimeException(e); } return operList; } return null; }
From source file:com.datatorrent.demos.mroperator.MapOperatorTest.java
License:Open Source License
public void testNodeProcessingSchema(MapOperator<LongWritable, Text, Text, IntWritable> oper)
        throws IOException {
    CollectorTestSink sortSink = new CollectorTestSink();
    oper.output.setSink(sortSink);
    oper.setMapClass(WordCount.Map.class);
    oper.setCombineClass(WordCount.Reduce.class);
    oper.setDirName("src/test/resources/mroperator/");
    oper.setConfigFile(null);
    oper.setInputFormatClass(TextInputFormat.class);

    Configuration conf = new Configuration();
    JobConf jobConf = new JobConf(conf);
    FileInputFormat.setInputPaths(jobConf, new Path("src/test/resources/mroperator/"));
    TextInputFormat inputFormat = new TextInputFormat();
    inputFormat.configure(jobConf);

    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    SerializationFactory serializationFactory = new SerializationFactory(conf);
    Serializer keySerializer = serializationFactory.getSerializer(splits[0].getClass());
    keySerializer.open(oper.getOutstream());
    keySerializer.serialize(splits[0]);
    oper.setInputSplitClass(splits[0].getClass());
    keySerializer.close();

    oper.setup(null);
    oper.beginWindow(0);
    oper.emitTuples();
    oper.emitTuples();
    oper.endWindow();
    oper.beginWindow(1);
    oper.emitTuples();
    oper.endWindow();

    Assert.assertEquals("number emitted tuples", 6, sortSink.collectedTuples.size());
    for (Object o : sortSink.collectedTuples) {
        logger.debug(o.toString());
    }
    logger.debug("Done testing round\n");
}
From source file:com.datatorrent.demos.mroperator.ReduceOperator.java
License:Open Source License
@Override
public void setup(OperatorContext context) {
    reporter = new ReporterImpl(ReporterType.Reducer, new Counters());
    if (context != null) {
        operatorId = context.getId();
    }
    cacheObject = new HashMap<K1, List<V1>>();
    outputCollector = new OutputCollectorImpl<K2, V2>();
    if (reduceClass != null) {
        try {
            reduceObj = reduceClass.newInstance();
        } catch (Exception e) {
            logger.info("can't instantiate object {}", e.getMessage());
            throw new RuntimeException(e);
        }
        Configuration conf = new Configuration();
        InputStream stream = null;
        if (configFile != null && configFile.length() > 0) {
            logger.info("system /{}", configFile);
            stream = ClassLoader.getSystemResourceAsStream("/" + configFile);
            if (stream == null) {
                logger.info("system {}", configFile);
                stream = ClassLoader.getSystemResourceAsStream(configFile);
            }
        }
        if (stream != null) {
            logger.info("found our stream... so adding it");
            conf.addResource(stream);
        }
        reduceObj.configure(new JobConf(conf));
    }
}
From source file:com.datatorrent.demos.mroperator.WordCount.java
License:Open Source License
public void run(String[] args) throws Exception {
    JobConf conf = new JobConf(this.getClass());
    conf.setJobName("wordcount");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(Map.class);
    conf.setCombinerClass(Reduce.class);
    conf.setReducerClass(Reduce.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(args[0]));
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);
}
From source file:com.digitalpebble.behemoth.ClassifierJob.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); options.addOption("m", "model", true, "location of the model"); // parse the command line arguments CommandLine line = null;//from w ww . j a v a 2 s. co m try { line = parser.parse(options, args); String input = line.getOptionValue("i"); String output = line.getOptionValue("o"); String model = line.getOptionValue("m"); if (line.hasOption("help")) { formatter.printHelp("ClassifierJob", options); return 0; } if (model == null | input == null | output == null) { formatter.printHelp("ClassifierJob", options); return -1; } } catch (ParseException e) { formatter.printHelp("ClassifierJob", options); } final FileSystem fs = FileSystem.get(getConf()); Path inputPath = new Path(line.getOptionValue("i")); Path outputPath = new Path(line.getOptionValue("o")); String modelPath = line.getOptionValue("m"); JobConf job = new JobConf(getConf()); // push the model file to the DistributedCache DistributedCache.addCacheArchive(new URI(modelPath), job); job.setJarByClass(this.getClass()); job.setJobName("ClassifierJob : " + inputPath.toString()); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(BehemothDocument.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); job.setMapperClass(TextClassifierMapper.class); job.setNumReduceTasks(0); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.set(modelNameParam, modelPath); try { JobClient.runJob(job); } catch (Exception e) { e.printStackTrace(); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.commoncrawl.CorpusMerger.java
License:Apache License
public int run(String[] args) throws Exception { Options options = new Options(); // automatically generate the help statement HelpFormatter formatter = new HelpFormatter(); // create the parser CommandLineParser parser = new GnuParser(); options.addOption("h", "help", false, "print this message"); options.addOption("i", "input", true, "input Behemoth corpus"); options.addOption("o", "output", true, "output Behemoth corpus"); // parse the command line arguments CommandLine line = null;/*from w ww.j a v a 2s . c o m*/ try { line = parser.parse(options, args); String input = line.getOptionValue("i"); if (line.hasOption("help")) { formatter.printHelp("CorpusMerger", options); return 0; } if (input == null) { formatter.printHelp("CorpusMerger", options); return -1; } } catch (ParseException e) { formatter.printHelp("CorpusMerger", options); return -1; } Path outputPath = new Path(line.getOptionValue("o")); String[] paths = (line.getOptionValues("i")); JobConf job = new JobConf(getConf()); // MUST not forget the line below job.setJarByClass(this.getClass()); job.setJobName("CorpusMerger"); job.setInputFormat(SequenceFileInputFormat.class); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(BehemothDocument.class); //job.setMapperClass(IdentityMapper.class); job.setReducerClass(MergerReducer.class); for (String in : paths) FileInputFormat.addInputPath(job, new Path(in)); FileOutputFormat.setOutputPath(job, outputPath); try { long start = System.currentTimeMillis(); JobClient.runJob(job); long finish = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("CorpusMerger completed. Timing: " + (finish - start) + " ms"); } } catch (Exception e) { LOG.error("Exception caught", e); // fs.delete(outputPath, true); } finally { } return 0; }
From source file:com.digitalpebble.behemoth.es.ESIndexerJob.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 1) {
        String syntax = "com.digitalpebble.behemoth.ESIndexerJob input";
        System.err.println(syntax);
        return -1;
    }

    Path inputPath = new Path(args[0]);

    JobConf job = new JobConf(getConf());
    job.setJarByClass(this.getClass());

    job.setJobName("Indexing " + inputPath + " into ElasticSearch");

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapOutputValueClass(MapWritable.class);
    job.setMapperClass(BehemothToESMapper.class);

    // disable speculative execution when writing to ES
    job.setSpeculativeExecution(false);

    // job.set("es.resource", "radio/artists"); // index used for storing data

    // use dedicated output format
    job.setOutputFormat(EsOutputFormat.class);

    FileInputFormat.addInputPath(job, inputPath);

    // no reducer : send straight to elasticsearch at end of mapping
    job.setNumReduceTasks(0);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("ESIndexerJob completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception while running job", e);
        return -1;
    }

    return 0;
}
From source file:com.digitalpebble.behemoth.gate.GATEDriver.java
License:Apache License
public int run(String[] args) throws Exception {
    final FileSystem fs = FileSystem.get(getConf());

    if (args.length < 3 || args.length > 4) {
        String syntax = "com.digitalpebble.behemoth.gate.GATEDriver in out path_gate_file [-XML]";
        System.err.println(syntax);
        return -1;
    }

    boolean dumpGATEXML = false;
    for (String arg : args) {
        if (arg.equalsIgnoreCase("-xml"))
            dumpGATEXML = true;
    }

    Path inputPath = new Path(args[0]);
    Path outputPath = new Path(args[1]);
    String zip_application_path = args[2];

    // check that the GATE application has been stored on HDFS
    Path zap = new Path(zip_application_path);
    if (fs.exists(zap) == false) {
        System.err.println("The GATE application " + zip_application_path
                + " can't be found on HDFS - aborting");
        return -1;
    }

    JobConf job = new JobConf(getConf());
    // MUST not forget the line below
    job.setJarByClass(this.getClass());

    job.setJobName("Processing " + args[0] + " with GATE application from " + zip_application_path);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    job.setOutputKeyClass(Text.class);

    if (dumpGATEXML) {
        job.setOutputValueClass(Text.class);
        job.setMapperClass(GATEXMLMapper.class);
    } else {
        job.setOutputValueClass(BehemothDocument.class);
        job.setMapperClass(GATEMapper.class);
    }

    // detect if any filters have been defined
    // and activate the reducer accordingly
    boolean isFilterRequired = BehemothReducer.isRequired(job);
    if (isFilterRequired)
        job.setReducerClass(BehemothReducer.class);
    else {
        job.setNumReduceTasks(0);
    }

    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // push the zipped_gate_application onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zip_application_path), job);

    job.set("gate.application.path", zip_application_path);

    try {
        long start = System.currentTimeMillis();
        JobClient.runJob(job);
        long finish = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("GATEDriver completed. Timing: " + (finish - start) + " ms");
        }
    } catch (Exception e) {
        LOG.error("Exception caught", e);
        // leave even partial output
        // fs.delete(outputPath, true);
    }

    return 0;
}