List of usage examples for org.apache.hadoop.mapred.JobConf.getOutputValueClass()
public Class<?> getOutputValueClass()
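Most of the examples below follow the same round trip: the job driver registers the reducer's output value type with setOutputValueClass(...), and the reducer, output format, or test code reads it back through getOutputValueClass(), often to instantiate an empty value reflectively. A minimal self-contained sketch of that pattern (the class name OutputValueClassExample is only for illustration):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.JobConf;

public class OutputValueClassExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();
        // Driver side: declare the job's output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Consumer side (reducer, output format, test): read the configured
        // type back and create an empty instance of it reflectively.
        Class<?> valueClass = job.getOutputValueClass();
        Writable value = (Writable) valueClass.newInstance();
        System.out.println("output value class: " + valueClass.getName()
                + ", new instance: " + value);
    }
}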
From source file:de.tudarmstadt.ukp.dkpro.bigdata.hadoop.DkproReducer.java
License:Apache License
@Override
public void configure(JobConf job) {
    super.configure(job);
    try {
        // create an output writable of the appropriate type
        outValue = (CASWritable) job.getOutputValueClass().newInstance();
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.bigdata.hadoop.UIMAMapReduceBase.java
License:Open Source License
@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);
        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);
        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }
        }
        replaceRecursively(engineDescription);
        this.engine = createEngine(engineDescription);
    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:edu.uci.ics.hyracks.dataflow.hadoop.HadoopReducerOperatorDescriptor.java
License:Apache License
public static RecordDescriptor getRecordDescriptor(JobConf conf, IHadoopClassFactory classFactory) {
    String outputKeyClassName = null;
    String outputValueClassName = null;
    if (conf.getUseNewMapper()) {
        JobContext context = new ContextFactory().createJobContext(conf);
        outputKeyClassName = context.getOutputKeyClass().getName();
        outputValueClassName = context.getOutputValueClass().getName();
    } else {
        outputKeyClassName = conf.getOutputKeyClass().getName();
        outputValueClassName = conf.getOutputValueClass().getName();
    }
    RecordDescriptor recordDescriptor = null;
    try {
        if (classFactory == null) {
            recordDescriptor = DatatypeHelper.createKeyValueRecordDescriptor(
                    (Class<? extends Writable>) Class.forName(outputKeyClassName),
                    (Class<? extends Writable>) Class.forName(outputValueClassName));
        } else {
            recordDescriptor = DatatypeHelper.createKeyValueRecordDescriptor(
                    (Class<? extends Writable>) classFactory.loadClass(outputKeyClassName),
                    (Class<? extends Writable>) classFactory.loadClass(outputValueClassName));
        }
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    }
    return recordDescriptor;
}
From source file:org.apache.blur.spark.Consumer.java
License:Apache License
private void run() {
    String checkpointDirectory = "hdfs://10.252.5.113:9000/user/hadoop/spark";

    // number of partitions for the Kafka topic
    int _partitionCount = 5;

    List<JavaDStream<MessageAndMetadata>> streamsList = new ArrayList<JavaDStream<MessageAndMetadata>>(
            _partitionCount);
    JavaDStream<MessageAndMetadata> unionStreams;

    SparkConf conf = new SparkConf().setAppName("KafkaReceiver").set("spark.streaming.blockInterval", "200");
    // Path to Blur libraries. Can be copied to each node of the Spark cluster.
    conf.set("spark.executor.extraClassPath", "/home/apache-blur-0.2.4/lib/*");
    // Use KryoSerializer for BlurMutate and Text.
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");

    JavaStreamingContext ssc = new JavaStreamingContext(conf, new Duration(3000));

    /*
     * Receive Kafka stream. Create individual receivers for each topic
     * partition.
     */
    for (int i = 0; i < _partitionCount; i++) {
        streamsList.add(ssc.receiverStream(new KafkaReceiver(_props, i)));
    }

    /*
     * Union all the streams if there is more than 1 stream.
     */
    if (streamsList.size() > 1) {
        unionStreams = ssc.union(streamsList.get(0), streamsList.subList(1, streamsList.size()));
    } else {
        // Otherwise, just use the 1 stream
        unionStreams = streamsList.get(0);
    }

    /*
     * Generate JavaPairDStream.
     */
    JavaPairDStream<Text, BlurMutate> pairDStream = unionStreams
            .mapToPair(new PairFunction<MessageAndMetadata, Text, BlurMutate>() {
                private static final long serialVersionUID = 443235214978L;

                public Tuple2<Text, BlurMutate> call(MessageAndMetadata mmeta) {
                    /*
                     * create the BlurMutate from MessageAndMetadata
                     */
                    String message = new String(mmeta.getPayload());
                    String keyStr = DigestUtils.shaHex(message);
                    Text key = new Text((keyStr).getBytes());
                    BlurMutate mutate = new BlurMutate(BlurMutate.MUTATE_TYPE.REPLACE, keyStr, keyStr,
                            "family");
                    mutate.addColumn("message", message);
                    return new Tuple2<Text, BlurMutate>(key, mutate);
                }
            });

    pairDStream.foreachRDD(new Function2<JavaPairRDD<Text, BlurMutate>, Time, Void>() {
        private static final long serialVersionUID = 88875777435L;

        @Override
        public Void call(JavaPairRDD<Text, BlurMutate> rdd, Time time) throws Exception {
            /*
             * Blur table details.
             */
            TableDescriptor tableDescriptor = new TableDescriptor();
            String tableUri = new Path("hdfs://10.252.5.113:9000/blur/tables/nrt").toString();
            tableDescriptor.tableUri = tableUri;
            tableDescriptor.cluster = "pearson";
            tableDescriptor.name = "nrt";
            tableDescriptor.shardCount = 9;
            Configuration conf = new Configuration();

            /*
             * Partition the RDD to match the Blur table shard count. A custom
             * partitioner channels each BlurMutate to the correct shard.
             */
            final JavaPairRDD<Text, BlurMutate> pRdd = rdd
                    .partitionBy(new BlurSparkPartitioner(tableDescriptor.shardCount))
                    .persist(StorageLevel.MEMORY_ONLY_2());

            /*
             * Blur-specific configuration.
             */
            BlurOutputFormat.setIndexLocally(conf, false);
            BlurOutputFormat.setOptimizeInFlight(conf, false);
            conf.setClass("mapreduce.reduce.class", DefaultBlurReducer.class, Reducer.class);
            conf.setClass("mapreduce.outputformat.class", BlurOutputFormat.class, OutputFormat.class);
            conf.setClass("mapreduce.partitioner.class", BlurPartitioner.class, Partitioner.class);
            conf.set("mapred.output.committer.class", BlurOutputCommitter.class.getName());
            conf.setInt("blur.output.max.document.buffer.size", 10000);
            BlurOutputFormat.setTableDescriptor(conf, tableDescriptor);

            JobConf jobConf = new JobConf(conf);
            jobConf.setNumReduceTasks(tableDescriptor.shardCount);
            jobConf.setOutputKeyClass(Text.class);
            jobConf.setOutputValueClass(BlurMutate.class);

            BlurMapReduceUtil.addAllJarsInBlurLib(conf);
            BlurMapReduceUtil.addDependencyJars(conf, org.apache.zookeeper.ZooKeeper.class,
                    org.apache.lucene.codecs.lucene42.Lucene42Codec.class, jobConf.getOutputKeyClass(),
                    jobConf.getOutputValueClass());

            /*
             * Write the RDD to the Blur table.
             */
            if (pRdd.count() > 0)
                pRdd.saveAsNewAPIHadoopFile(tableUri, Text.class, BlurMutate.class, BlurOutputFormat.class,
                        jobConf);

            return null;
        }
    });

    // ssc.checkpoint(checkpointDirectory);
    ssc.start();
    ssc.awaitTermination();
}
From source file:org.dkpro.bigdata.hadoop.UIMAMapReduceBase.java
License:Open Source License
@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.inputName = job.get("mapred.input.dir");
        this.taskId = job.get("mapred.task.id");
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);
        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);
        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        if (job.getBoolean("dkpro.output.onedirpertask", true)) {
            this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        } else {
            this.working_dir = new Path("uima_output");
        }
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }
        }
        Map<String, String> variableValues = new HashMap<String, String>();
        variableValues.put("\\$dir", this.results_dir.toString());
        variableValues.put("\\$input", this.inputName);
        variableValues.put("\\$taskid", this.taskId);
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (cacheFiles != null) {
            for (Path cacheFile : cacheFiles) {
                variableValues.put("^\\$cache/" + cacheFile.getName(), cacheFile.toUri().getPath());
            }
        }
        for (final Entry<String, URL> resource : this.resourceURIs.entrySet()) {
            variableValues.put("\\$" + resource, resource.getValue().toString());
        }
        AnalysisEngineUtil.replaceVariables(engineDescription, variableValues);
        this.engine = createEngine(engineDescription);
    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.pentaho.hadoop.mapreduce.PentahoMapReduceBase.java
License:Apache License
@SuppressWarnings("unchecked") @Override//from w w w . j a va 2 s. c o m public void configure(JobConf job) { super.configure(job); debug = "true".equalsIgnoreCase(job.get("debug")); //$NON-NLS-1$ transMapXml = job.get("transformation-map-xml"); transCombinerXml = job.get("transformation-combiner-xml"); transReduceXml = job.get("transformation-reduce-xml"); mapInputStepName = job.get("transformation-map-input-stepname"); mapOutputStepName = job.get("transformation-map-output-stepname"); combinerInputStepName = job.get("transformation-combiner-input-stepname"); combinerOutputStepName = job.get("transformation-combiner-output-stepname"); combineSingleThreaded = isCombinerSingleThreaded(job); reduceInputStepName = job.get("transformation-reduce-input-stepname"); reduceOutputStepName = job.get("transformation-reduce-output-stepname"); reduceSingleThreaded = isReducerSingleThreaded(job); String xmlVariableSpace = job.get("variableSpace"); if (!Const.isEmpty(xmlVariableSpace)) { setDebugStatus("PentahoMapReduceBase. variableSpace was retrieved from the job. The contents: "); // deserialize from xml to variable space XStream xStream = new XStream(); if (xStream != null) { setDebugStatus("PentahoMapReduceBase: Setting classes variableSpace property.: "); variableSpace = (VariableSpace) xStream.fromXML(xmlVariableSpace); for (String variableName : variableSpace.listVariables()) { if (variableName.startsWith(KETTLE_VARIABLE_PREFIX)) { System.setProperty(variableName, variableSpace.getVariable(variableName)); } } } } else { setDebugStatus( "PentahoMapReduceBase: The PDI Job's variable space was not found in the job configuration."); variableSpace = new Variables(); } // Check for environment variables in the userDefined variables Iterator<Entry<String, String>> iter = job.iterator(); while (iter.hasNext()) { Entry<String, String> entry = iter.next(); if (entry.getKey().startsWith(ENVIRONMENT_VARIABLE_PREFIX)) { System.setProperty(entry.getKey().substring(ENVIRONMENT_VARIABLE_PREFIX.length()), entry.getValue()); } else if (entry.getKey().startsWith(KETTLE_VARIABLE_PREFIX)) { System.setProperty(entry.getKey(), entry.getValue()); } } MRUtil.passInformationToTransformation(variableSpace, job); switch (mrOperation) { case Combine: outClassK = (Class<K>) job.getMapOutputKeyClass(); outClassV = (Class<V>) job.getMapOutputValueClass(); break; case Reduce: outClassK = (Class<K>) job.getOutputKeyClass(); outClassV = (Class<V>) job.getOutputValueClass(); break; default: throw new IllegalArgumentException("Unsupported MapReduce operation: " + mrOperation); } if (debug) { System.out.println("Job configuration>"); System.out.println("Output key class: " + outClassK.getName()); System.out.println("Output value class: " + outClassV.getName()); } // set the log level to what the level of the job is String stringLogLevel = job.get("logLevel"); if (!Const.isEmpty(stringLogLevel)) { logLevel = LogLevel.valueOf(stringLogLevel); setDebugStatus("Log level set to " + stringLogLevel); } else { System.out.println( "Could not retrieve the log level from the job configuration. logLevel will not be set."); } createTrans(job); }
From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java
License:Open Source License
@Test
public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./test-res/wordcount-mapper.ktr", "./test-res/wordcount-reducer.ktr",
            "./test-res/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputValueClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);

    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
}
From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java
License:Apache License
@Test
public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./src/test/resources/wordcount-mapper.ktr",
            "./src/test/resources/wordcount-reducer.ktr", "./src/test/resources/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputValueClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);

    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
}
From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIT.java
License:Apache License
@Test
public void testReducerOutputClasses() throws IOException, KettleException {
    JobConf jobConf = createJobConf("./src/it/resources/wordcount-mapper.ktr",
            "./src/it/resources/wordcount-reducer.ktr", "./src/it/resources/wordcount-reducer.ktr");

    jobConf.setMapOutputKeyClass(Text.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputValueClass(NullWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);

    GenericTransReduce reducer = new GenericTransReduce();
    reducer.configure(jobConf);

    assertEquals(jobConf.getOutputKeyClass(), reducer.getOutClassK());
    assertEquals(jobConf.getOutputValueClass(), reducer.getOutClassV());
}
From source file:tap.core.ReducerBridge.java
License:Apache License
@Override
public void configure(JobConf conf) {
    super.configure(conf);
    isTextOutput = conf.getOutputFormat() instanceof TextOutputFormat;
    isProtoOutput = conf.getOutputFormat() instanceof TapfileOutputFormat;

    if (isProtoOutput) {
        try {
            mapOutClass = Class.forName(conf.get(Phase.MAP_OUT_CLASS));
            reduceOutClass = Class.forName(conf.get(Phase.REDUCE_OUT_CLASS));
            if (mapOutClass != reduceOutClass) {
                reduceOutKeyChanges = true;
                String groupBy = conf.get(Phase.GROUP_BY);
                String sortBy = conf.get(Phase.SORT_BY);
                reduceOutSchema = ReflectUtils.getSchema(ObjectFactory.newInstance(reduceOutClass));
                extractor = ReflectionKeyExtractor.getReflectionKeyExtractorForReduceOutKey(reduceOutSchema,
                        groupBy, sortBy);
            }
        } catch (Exception e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    multiOutputPrefix = conf.get(Phase.MULTIPLE_OUTPUT_PREFIX);
    if (multiOutputPrefix == null)
        multiOutputPrefix = "out";

    MultipleOutputs.addMultiNamedOutput(conf, multiOutputPrefix, conf.getOutputFormat().getClass(),
            conf.getOutputKeyClass(), conf.getOutputValueClass());

    this.multiOutput = new MultipleOutputs(conf);
}