List of usage examples for the org.apache.hadoop.mapred.JobConf constructor
public JobConf(boolean loadDefaults)
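The snippets below do not actually call the boolean overload directly; they build a JobConf by copying an existing Configuration or JobConf, or by loading a job file. For orientation, here is a minimal sketch of both construction styles (a hypothetical standalone class, assuming only the classic org.apache.hadoop.mapred API is on the classpath):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;

public class JobConfConstructionSketch {

    public static void main(String[] args) {
        // JobConf(boolean loadDefaults): with 'false', the default resources
        // (core-default.xml, core-site.xml, ...) are not loaded, so only values
        // set explicitly by the caller are present.
        JobConf withDefaults = new JobConf(true);
        JobConf withoutDefaults = new JobConf(false);

        // Typically non-null when defaults are loaded, null when they are skipped.
        System.out.println(withDefaults.get("io.file.buffer.size"));
        System.out.println(withoutDefaults.get("io.file.buffer.size"));

        // JobConf(Configuration conf): the copy-style constructor used throughout
        // the examples below, so local changes do not leak into the parent.
        Configuration parent = new Configuration();
        JobConf copy = new JobConf(parent);
        copy.set("example.key", "example-value"); // hypothetical key; parent stays untouched
        System.out.println(copy.get("example.key") + " / " + parent.get("example.key"));
    }
}

The copy form is also what resolves for the new JobConf(jobConf) and new JobConf(hadoopConf) calls seen below, since JobConf extends Configuration.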
From source file:cascading.avro.TrevniSchemeTest.java
License:Apache License
@Test
public void testSpecifiedColumns() throws Exception {
    final Schema schema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage.avsc"));
    final Schema specifiedColumnsSchema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage2.avsc"));

    Configuration hadoopConf = new Configuration();
    // compression codec for trevni column block.
    // KKr - This fails on systems without Snappy installed, so commenting it out
    // hadoopConf.set("trevni.meta.trevni.codec", "snappy");

    Map<Object, Object> confMap = new HashMap<Object, Object>();
    Iterator<Entry<String, String>> iter = hadoopConf.iterator();
    while (iter.hasNext()) {
        Entry<String, String> entry = iter.next();
        confMap.put(entry.getKey(), entry.getValue());
    }

    JobConf jobConf = new JobConf(hadoopConf);

    String in = tempDir.getRoot().toString() + "/specifiedColumns/in";
    String out = tempDir.getRoot().toString() + "/specifiedColumns/out";

    final Fields fields = new Fields("addressCode", "timestamp", "devicePowerEventList");
    final Fields innerFields = new Fields("power", "deviceType", "deviceId", "status");

    Tap lfsSource = new Lfs(new TrevniScheme(schema), in, SinkMode.REPLACE);

    TupleEntryCollector write = lfsSource.openForWrite(new HadoopFlowProcess(jobConf));

    List<TupleEntry> devicePowerEventList = new ArrayList<TupleEntry>();
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(1300.0, 5, 0, 1)));
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(3500.4, 4, 1, 0)));

    List<TupleEntry> devicePowerEventList2 = new ArrayList<TupleEntry>();
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(3570.0, 3, 0, 1)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(110.4, 2, 1, 0)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(250.9, 3, 3, 1)));

    write.add(new TupleEntry(fields, new Tuple("4874025000-514", 1356998460000L, devicePowerEventList)));
    write.add(new TupleEntry(fields, new Tuple("4725033000-4031", 1356998520000L, devicePowerEventList2)));
    write.close();

    Pipe writePipe = new Pipe("tuples to trevni");

    Tap lfsTrevniSource = new Lfs(new TrevniScheme(schema), in + "/*");
    Tap trevniSink = new Lfs(new TrevniScheme(schema), out);

    Flow flow = new HadoopFlowConnector(confMap).connect(lfsTrevniSource, trevniSink, writePipe);
    flow.complete();

    // Read the specified columns.
    Tap trevniSource = new Lfs(new TrevniScheme(specifiedColumnsSchema), out + "/*");

    TupleEntryIterator iterator = trevniSource.openForRead(new HadoopFlowProcess(jobConf));

    assertTrue(iterator.hasNext());
    final TupleEntry readEntry1 = iterator.next();

    assertTrue(readEntry1.getString("addressCode").equals("4874025000-514"));
    assertEquals(2, ((List) readEntry1.getObject("devicePowerEventList")).size());
    assertEquals(1300.0, ((Tuple) ((List) readEntry1.getObject("devicePowerEventList")).get(0)).getDouble(0));

    final TupleEntry readEntry2 = iterator.next();

    assertTrue(readEntry2.getString("addressCode").equals("4725033000-4031"));
    assertEquals(3, ((List) readEntry2.getObject("devicePowerEventList")).size());
    assertEquals(110.4, ((Tuple) ((List) readEntry2.getObject("devicePowerEventList")).get(1)).getDouble(0));
}
From source file:cascading.ClusterTestCase.java
License:Open Source License
public JobConf getJobConf() {
    return new JobConf(jobConf);
}
From source file:cascading.flow.Flow.java
License:Open Source License
private void setJobConf(JobConf jobConf) {
    if (jobConf == null) // this is ok, getJobConf will pass a default parent in
        return;

    this.jobConf = new JobConf(jobConf); // prevent local values from being shared
    this.jobConf.set("fs.http.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.https.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.s3tp.impl", S3HttpFileSystem.class.getName());

    // set the ID for future reference
    this.jobConf.set("cascading.flow.id", getID());
}
From source file:cascading.flow.Flow.java
License:Open Source License
/**
 * Method areSourcesNewer returns true if any source is newer than the given sinkModified date value.
 *
 * @param sinkModified of type long
 * @return boolean
 * @throws IOException when
 */
public boolean areSourcesNewer(long sinkModified) throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident

    long sourceMod = 0;

    try {
        for (Tap source : sources.values()) {
            if (!source.pathExists(confCopy))
                throw new FlowException("source does not exist: " + source);

            sourceMod = source.getPathModified(confCopy);

            if (sinkModified < sourceMod)
                return true;
        }

        return false;
    } finally {
        if (LOG.isInfoEnabled())
            logInfo("source modification date at: " + new Date(sourceMod)); // not oldest, we didn't check them all
    }
}
From source file:cascading.flow.Flow.java
License:Open Source License
/**
 * Method getSinkModified returns the youngest modified date of any sink {@link Tap} managed by this Flow instance.
 * <p/>
 * If zero (0) is returned, at least one of the sink resources does not exist. If minus one (-1) is returned,
 * at least one of the sinks is marked for delete ({@link Tap#isReplace() returns true}).
 *
 * @return the sinkModified (type long) of this Flow object.
 * @throws IOException when
 */
public long getSinkModified() throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident

    long sinkModified = Long.MAX_VALUE;

    for (Tap sink : sinks.values()) {
        if (sink.isReplace() || sink.isUpdate())
            sinkModified = -1L;
        else {
            if (!sink.pathExists(confCopy))
                sinkModified = 0L;
            else
                sinkModified = Math.min(sinkModified, sink.getPathModified(confCopy)); // return youngest mod date
        }
    }

    if (LOG.isInfoEnabled()) {
        if (sinkModified == -1L)
            logInfo("atleast one sink is marked for delete");
        if (sinkModified == 0L)
            logInfo("atleast one sink does not exist");
        else
            logInfo("sink oldest modified date: " + new Date(sinkModified));
    }

    return sinkModified;
}
From source file:cascading.flow.FlowStep.java
License:Open Source License
private void initFromTraps(JobConf conf, Map<String, Tap> traps) throws IOException {
    if (!traps.isEmpty()) {
        JobConf trapConf = new JobConf(conf);

        for (Tap tap : traps.values())
            tap.sinkInit(trapConf);
    }
}
From source file:cascading.flow.FlowStep.java
License:Open Source License
private void initFromSources(JobConf conf) throws IOException {
    JobConf[] fromJobs = new JobConf[sources.size()];
    int i = 0;

    for (Tap tap : sources.keySet()) {
        fromJobs[i] = new JobConf(conf);
        tap.sourceInit(fromJobs[i]);
        fromJobs[i].set("cascading.step.source", Util.serializeBase64(tap));
        i++;
    }

    MultiInputFormat.addInputFormat(conf, fromJobs);
}
From source file:cascading.flow.hadoop.HadoopStepStats.java
License:Open Source License
public void captureJobStats() {
    RunningJob runningJob = getRunningJob();

    if (runningJob == null)
        return;

    JobConf ranJob = new JobConf(runningJob.getJobFile());

    setNumMapTasks(ranJob.getNumMapTasks());
    setNumReducerTasks(ranJob.getNumReduceTasks());
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)), 10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}
From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java
License:Open Source License
@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1, new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2, new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job, new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}