Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf JobConf.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
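
For orientation, here is a minimal sketch of the prototype itself before the real-world examples below; the class name and the property values are illustrative assumptions, not taken from any of the source files. Passing false turns off reading of the default resources, so the configuration starts out holding only what is set on it explicitly.

import org.apache.hadoop.mapred.JobConf;

public class JobConfNoDefaultsSketch {
    public static void main(String[] args) {
        // loadDefaults = false: skip the default resources, start from an empty configuration.
        JobConf conf = new JobConf(false);

        conf.setJobName("no-defaults-sketch");
        conf.setNumReduceTasks(2);

        // Only explicitly set values (plus the getters' built-in fallbacks) are visible.
        System.out.println(conf.getJobName());        // no-defaults-sketch
        System.out.println(conf.getNumReduceTasks()); // 2
    }
}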

Usage

From source file:cascading.avro.TrevniSchemeTest.java

License:Apache License

@Test
public void testSpecifiedColumns() throws Exception {

    final Schema schema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage.avsc"));

    final Schema specifiedColumnsSchema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage2.avsc"));

    Configuration hadoopConf = new Configuration();

    // compression codec for trevni column block.
    // KKr - This fails on systems without Snappy installed, so commenting it out
    // hadoopConf.set("trevni.meta.trevni.codec", "snappy");

    Map<Object, Object> confMap = new HashMap<Object, Object>();
    Iterator<Entry<String, String>> iter = hadoopConf.iterator();
    while (iter.hasNext()) {
        Entry<String, String> entry = iter.next();
        confMap.put(entry.getKey(), entry.getValue());
    }

    JobConf jobConf = new JobConf(hadoopConf);

    String in = tempDir.getRoot().toString() + "/specifiedColumns/in";
    String out = tempDir.getRoot().toString() + "/specifiedColumns/out";

    final Fields fields = new Fields("addressCode", "timestamp", "devicePowerEventList");

    final Fields innerFields = new Fields("power", "deviceType", "deviceId", "status");

    Tap lfsSource = new Lfs(new TrevniScheme(schema), in, SinkMode.REPLACE);

    TupleEntryCollector write = lfsSource.openForWrite(new HadoopFlowProcess(jobConf));

    List<TupleEntry> devicePowerEventList = new ArrayList<TupleEntry>();
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(1300.0, 5, 0, 1)));
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(3500.4, 4, 1, 0)));

    List<TupleEntry> devicePowerEventList2 = new ArrayList<TupleEntry>();
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(3570.0, 3, 0, 1)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(110.4, 2, 1, 0)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(250.9, 3, 3, 1)));

    write.add(new TupleEntry(fields, new Tuple("4874025000-514", 1356998460000L, devicePowerEventList)));
    write.add(new TupleEntry(fields, new Tuple("4725033000-4031", 1356998520000L, devicePowerEventList2)));

    write.close();

    Pipe writePipe = new Pipe("tuples to trevni");
    Tap lfsTrevniSource = new Lfs(new TrevniScheme(schema), in + "/*");
    Tap trevniSink = new Lfs(new TrevniScheme(schema), out);

    Flow flow = new HadoopFlowConnector(confMap).connect(lfsTrevniSource, trevniSink, writePipe);
    flow.complete();

    // Read the specified columns.      
    Tap trevniSource = new Lfs(new TrevniScheme(specifiedColumnsSchema), out + "/*");

    TupleEntryIterator iterator = trevniSource.openForRead(new HadoopFlowProcess(jobConf));

    assertTrue(iterator.hasNext());

    final TupleEntry readEntry1 = iterator.next();

    assertTrue(readEntry1.getString("addressCode").equals("4874025000-514"));
    assertEquals(2, ((List) readEntry1.getObject("devicePowerEventList")).size());
    assertEquals(1300.0, ((Tuple) ((List) readEntry1.getObject("devicePowerEventList")).get(0)).getDouble(0), 0.0001);

    final TupleEntry readEntry2 = iterator.next();

    assertTrue(readEntry2.getString("addressCode").equals("4725033000-4031"));
    assertEquals(3, ((List) readEntry2.getObject("devicePowerEventList")).size());
    assertEquals(110.4, ((Tuple) ((List) readEntry2.getObject("devicePowerEventList")).get(1)).getDouble(0), 0.0001);
}

From source file:cascading.ClusterTestCase.java

License:Open Source License

public JobConf getJobConf() {
    return new JobConf(jobConf);
}

From source file:cascading.flow.Flow.java

License:Open Source License

private void setJobConf(JobConf jobConf) {
    if (jobConf == null) // this is ok, getJobConf will pass a default parent in
        return;

    this.jobConf = new JobConf(jobConf); // prevent local values from being shared
    this.jobConf.set("fs.http.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.https.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.s3tp.impl", S3HttpFileSystem.class.getName());

    // set the ID for future reference
    this.jobConf.set("cascading.flow.id", getID());
}

From source file:cascading.flow.Flow.java

License:Open Source License

/**
 * Method areSourcesNewer returns true if any source is newer than the given sinkModified date value.
 *
 * @param sinkModified of type long
 * @return boolean
 * @throws IOException when a source resource cannot be accessed
 */
public boolean areSourcesNewer(long sinkModified) throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident
    long sourceMod = 0;

    try {
        for (Tap source : sources.values()) {
            if (!source.pathExists(confCopy))
                throw new FlowException("source does not exist: " + source);

            sourceMod = source.getPathModified(confCopy);

            if (sinkModified < sourceMod)
                return true;
        }

        return false;
    } finally {
        if (LOG.isInfoEnabled())
            logInfo("source modification date at: " + new Date(sourceMod)); // not oldest, we didnt check them all
    }
}

From source file:cascading.flow.Flow.java

License:Open Source License

/**
 * Method getSinkModified returns the oldest modified date of any sink {@link Tap} managed by this Flow instance.
 * <p/>
 * If zero (0) is returned, at least one of the sink resources does not exist. If minus one (-1) is returned,
 * at least one of the sinks is marked for delete ({@link Tap#isReplace() returns true}).
 *
 * @return the sinkModified (type long) of this Flow object.
 * @throws IOException when a sink resource cannot be accessed
 */
public long getSinkModified() throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident
    long sinkModified = Long.MAX_VALUE;

    for (Tap sink : sinks.values()) {
        if (sink.isReplace() || sink.isUpdate())
            sinkModified = -1L;
        else {
            if (!sink.pathExists(confCopy))
                sinkModified = 0L;
            else
                sinkModified = Math.min(sinkModified, sink.getPathModified(confCopy)); // keep the oldest modification date
        }
    }

    if (LOG.isInfoEnabled()) {
        if (sinkModified == -1L)
            logInfo("at least one sink is marked for delete");
        else if (sinkModified == 0L)
            logInfo("at least one sink does not exist");
        else
            logInfo("sink oldest modified date: " + new Date(sinkModified));
    }

    return sinkModified;
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

private void initFromTraps(JobConf conf, Map<String, Tap> traps) throws IOException {
    if (!traps.isEmpty()) {
        JobConf trapConf = new JobConf(conf);

        for (Tap tap : traps.values())
            tap.sinkInit(trapConf);
    }
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

private void initFromSources(JobConf conf) throws IOException {
    JobConf[] fromJobs = new JobConf[sources.size()];
    int i = 0;

    for (Tap tap : sources.keySet()) {
        fromJobs[i] = new JobConf(conf);
        tap.sourceInit(fromJobs[i]);
        fromJobs[i].set("cascading.step.source", Util.serializeBase64(tap));
        i++;
    }

    MultiInputFormat.addInputFormat(conf, fromJobs);
}

From source file:cascading.flow.hadoop.HadoopStepStats.java

License:Open Source License

public void captureJobStats() {
    RunningJob runningJob = getRunningJob();

    if (runningJob == null)
        return;

    JobConf ranJob = new JobConf(runningJob.getJobFile());

    setNumMapTasks(ranJob.getNumMapTasks());
    setNumReducerTasks(ranJob.getNumReduceTasks());
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)),
            10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1,
            new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2,
            new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
            new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}