Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf JobConf.

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
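
For orientation, here is a minimal sketch of the prototype itself before the real-world examples below; the class name and the property values are illustrative assumptions, not taken from any of the source files. Passing false turns off reading of the default resources, so the configuration starts out holding only what is set on it explicitly.

import org.apache.hadoop.mapred.JobConf;

public class JobConfNoDefaultsSketch {
    public static void main(String[] args) {
        // loadDefaults = false: skip the default resources, start from an empty configuration.
        JobConf conf = new JobConf(false);

        conf.setJobName("no-defaults-sketch");
        conf.setNumReduceTasks(2);

        // Only explicitly set values (plus the getters' built-in fallbacks) are visible.
        System.out.println(conf.getJobName());        // no-defaults-sketch
        System.out.println(conf.getNumReduceTasks()); // 2
    }
}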

Usage

From source file:cascading.avro.TrevniSchemeTest.java

License:Apache License

@Test
public void testSpecifiedColumns() throws Exception {

    final Schema schema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage.avsc"));

    final Schema specifiedColumnsSchema = new Schema.Parser()
            .parse(getClass().getResourceAsStream("electric-power-usage2.avsc"));

    Configuration hadoopConf = new Configuration();

    // compression codec for trevni column block.
    // KKr - This fails on systems without Snappy installed, so commenting it out
    // hadoopConf.set("trevni.meta.trevni.codec", "snappy");

    Map<Object, Object> confMap = new HashMap<Object, Object>();
    Iterator<Entry<String, String>> iter = hadoopConf.iterator();
    while (iter.hasNext()) {
        Entry<String, String> entry = iter.next();
        confMap.put(entry.getKey(), entry.getValue());
    }

    JobConf jobConf = new JobConf(hadoopConf);

    String in = tempDir.getRoot().toString() + "/specifiedColumns/in";
    String out = tempDir.getRoot().toString() + "/specifiedColumns/out";

    final Fields fields = new Fields("addressCode", "timestamp", "devicePowerEventList");

    final Fields innerFields = new Fields("power", "deviceType", "deviceId", "status");

    Tap lfsSource = new Lfs(new TrevniScheme(schema), in, SinkMode.REPLACE);

    TupleEntryCollector write = lfsSource.openForWrite(new HadoopFlowProcess(jobConf));

    List<TupleEntry> devicePowerEventList = new ArrayList<TupleEntry>();
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(1300.0, 5, 0, 1)));
    devicePowerEventList.add(new TupleEntry(innerFields, new Tuple(3500.4, 4, 1, 0)));

    List<TupleEntry> devicePowerEventList2 = new ArrayList<TupleEntry>();
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(3570.0, 3, 0, 1)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(110.4, 2, 1, 0)));
    devicePowerEventList2.add(new TupleEntry(innerFields, new Tuple(250.9, 3, 3, 1)));

    write.add(new TupleEntry(fields, new Tuple("4874025000-514", 1356998460000L, devicePowerEventList)));
    write.add(new TupleEntry(fields, new Tuple("4725033000-4031", 1356998520000L, devicePowerEventList2)));

    write.close();

    Pipe writePipe = new Pipe("tuples to trevni");
    Tap lfsTrevniSource = new Lfs(new TrevniScheme(schema), in + "/*");
    Tap trevniSink = new Lfs(new TrevniScheme(schema), out);

    Flow flow = new HadoopFlowConnector(confMap).connect(lfsTrevniSource, trevniSink, writePipe);
    flow.complete();

    // Read the specified columns.      
    Tap trevniSource = new Lfs(new TrevniScheme(specifiedColumnsSchema), out + "/*");

    TupleEntryIterator iterator = trevniSource.openForRead(new HadoopFlowProcess(jobConf));

    assertTrue(iterator.hasNext());

    final TupleEntry readEntry1 = iterator.next();

    assertTrue(readEntry1.getString("addressCode").equals("4874025000-514"));
    assertEquals(2, ((List) readEntry1.getObject("devicePowerEventList")).size());
    assertEquals(1300.0, ((Tuple) ((List) readEntry1.getObject("devicePowerEventList")).get(0)).getDouble(0), 0.0001);

    final TupleEntry readEntry2 = iterator.next();

    assertTrue(readEntry2.getString("addressCode").equals("4725033000-4031"));
    assertEquals(3, ((List) readEntry2.getObject("devicePowerEventList")).size());
    assertEquals(110.4, ((Tuple) ((List) readEntry2.getObject("devicePowerEventList")).get(1)).getDouble(0), 0.0001);
}

From source file:cascading.ClusterTestCase.java

License:Open Source License

public JobConf getJobConf() {
    return new JobConf(jobConf);
}

From source file:cascading.flow.Flow.java

License:Open Source License

private void setJobConf(JobConf jobConf) {
    if (jobConf == null) // this is ok, getJobConf will pass a default parent in
        return;

    this.jobConf = new JobConf(jobConf); // prevent local values from being shared
    this.jobConf.set("fs.http.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.https.impl", HttpFileSystem.class.getName());
    this.jobConf.set("fs.s3tp.impl", S3HttpFileSystem.class.getName());

    // set the ID for future reference
    this.jobConf.set("cascading.flow.id", getID());
}

From source file:cascading.flow.Flow.java

License:Open Source License

/**
 * Method areSourcesNewer returns true if any source is newer than the given sinkModified date value.
 *
 * @param sinkModified of type long
 * @return boolean
 * @throws IOException when a source resource cannot be accessed
 */
public boolean areSourcesNewer(long sinkModified) throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident
    long sourceMod = 0;

    try {
        for (Tap source : sources.values()) {
            if (!source.pathExists(confCopy))
                throw new FlowException("source does not exist: " + source);

            sourceMod = source.getPathModified(confCopy);

            if (sinkModified < sourceMod)
                return true;
        }

        return false;
    } finally {
        if (LOG.isInfoEnabled())
            logInfo("source modification date at: " + new Date(sourceMod)); // not oldest, we didnt check them all
    }
}

From source file:cascading.flow.Flow.java

License:Open Source License

/**
 * Method getSinkModified returns the oldest modified date of any sink {@link Tap} managed by this Flow instance.
 * <p/>
 * If zero (0) is returned, at least one of the sink resources does not exist. If minus one (-1) is returned,
 * at least one of the sinks is marked for delete ({@link Tap#isReplace() returns true}).
 *
 * @return the sinkModified (type long) of this Flow object.
 * @throws IOException when a sink resource cannot be accessed
 */
public long getSinkModified() throws IOException {
    JobConf confCopy = new JobConf(getJobConf()); // let's not add unused values by accident
    long sinkModified = Long.MAX_VALUE;

    for (Tap sink : sinks.values()) {
        if (sink.isReplace() || sink.isUpdate())
            sinkModified = -1L;
        else {
            if (!sink.pathExists(confCopy))
                sinkModified = 0L;
            else
                sinkModified = Math.min(sinkModified, sink.getPathModified(confCopy)); // keep the oldest modification date
        }
    }

    if (LOG.isInfoEnabled()) {
        if (sinkModified == -1L)
            logInfo("at least one sink is marked for delete");
        else if (sinkModified == 0L)
            logInfo("at least one sink does not exist");
        else
            logInfo("sink oldest modified date: " + new Date(sinkModified));
    }

    return sinkModified;
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

private void initFromTraps(JobConf conf, Map<String, Tap> traps) throws IOException {
    if (!traps.isEmpty()) {
        JobConf trapConf = new JobConf(conf);

        for (Tap tap : traps.values())
            tap.sinkInit(trapConf);
    }
}

From source file:cascading.flow.FlowStep.java

License:Open Source License

private void initFromSources(JobConf conf) throws IOException {
    JobConf[] fromJobs = new JobConf[sources.size()];
    int i = 0;

    for (Tap tap : sources.keySet()) {
        fromJobs[i] = new JobConf(conf);
        tap.sourceInit(fromJobs[i]);
        fromJobs[i].set("cascading.step.source", Util.serializeBase64(tap));
        i++;
    }

    MultiInputFormat.addInputFormat(conf, fromJobs);
}

From source file:cascading.flow.hadoop.HadoopStepStats.java

License:Open Source License

public void captureJobStats() {
    RunningJob runningJob = getRunningJob();

    if (runningJob == null)
        return;

    JobConf ranJob = new JobConf(runningJob.getJobFile());

    setNumMapTasks(ranJob.getNumMapTasks());
    setNumReducerTasks(ranJob.getNumReduceTasks());
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testFlow() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    JobConf defaultConf = (JobConf) ((BaseHadoopPlatform) getPlatform()).getConfiguration();

    JobConf conf = new JobConf(defaultConf);
    conf.setJobName("mrflow");

    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path(inputFileApache));

    String outputPath = getOutputPath("flowTest");
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    Flow flow = new MapReduceFlow("mrflow", conf, true);

    validateLength(new Hfs(new TextLine(), inputFileApache).openForRead(new HadoopFlowProcess(defaultConf)),
            10);

    flow.complete();

    validateLength(new Hfs(new TextLine(), outputPath).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}

From source file:cascading.flow.hadoop.MapReduceFlowPlatformTest.java

License:Open Source License

@Test
public void testCascade() throws IOException {
    getPlatform().copyFromLocal(inputFileApache);

    // Setup two standard cascading flows that will generate the input for the first MapReduceFlow
    Tap source1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(inputFileApache, false));
    String sinkPath4 = getOutputPath("flow4");
    Tap sink1 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath4, true), SinkMode.REPLACE);
    Flow firstFlow = getPlatform().getFlowConnector(getProperties()).connect(source1, sink1,
            new Pipe("first-flow"));

    String sinkPath5 = getOutputPath("flow5");
    Tap sink2 = new Hfs(new TextLine(new Fields("offset", "line")), remove(sinkPath5, true), SinkMode.REPLACE);
    Flow secondFlow = getPlatform().getFlowConnector(getProperties()).connect(sink1, sink2,
            new Pipe("second-flow"));

    JobConf defaultConf = HadoopPlanner.createJobConf(getProperties());

    JobConf firstConf = new JobConf(defaultConf);
    firstConf.setJobName("first-mr");

    firstConf.setOutputKeyClass(LongWritable.class);
    firstConf.setOutputValueClass(Text.class);

    firstConf.setMapperClass(IdentityMapper.class);
    firstConf.setReducerClass(IdentityReducer.class);

    firstConf.setInputFormat(TextInputFormat.class);
    firstConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(firstConf, new Path(remove(sinkPath5, true)));
    String sinkPath1 = getOutputPath("flow1");
    FileOutputFormat.setOutputPath(firstConf, new Path(remove(sinkPath1, true)));

    Flow firstMR = new MapReduceFlow(firstConf, true);

    JobConf secondConf = new JobConf(defaultConf);
    secondConf.setJobName("second-mr");

    secondConf.setOutputKeyClass(LongWritable.class);
    secondConf.setOutputValueClass(Text.class);

    secondConf.setMapperClass(IdentityMapper.class);
    secondConf.setReducerClass(IdentityReducer.class);

    secondConf.setInputFormat(TextInputFormat.class);
    secondConf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(secondConf, new Path(remove(sinkPath1, true)));
    String sinkPath2 = getOutputPath("flow2");
    FileOutputFormat.setOutputPath(secondConf, new Path(remove(sinkPath2, true)));

    Flow secondMR = new MapReduceFlow(secondConf, true);

    Job job = new Job(defaultConf);
    job.setJobName("third-mr");

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(org.apache.hadoop.mapreduce.Mapper.class);
    job.setReducerClass(org.apache.hadoop.mapreduce.Reducer.class);

    job.setInputFormatClass(org.apache.hadoop.mapreduce.lib.input.TextInputFormat.class);
    job.setOutputFormatClass(org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.class);
    job.getConfiguration().set("mapred.mapper.new-api", "true");
    job.getConfiguration().set("mapred.reducer.new-api", "true");

    org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new Path(remove(sinkPath2, true)));
    String sinkPath3 = getOutputPath("flow3");
    org.apache.hadoop.mapreduce.lib.output.FileOutputFormat.setOutputPath(job,
            new Path(remove(sinkPath3, true)));

    Flow thirdMR = new MapReduceFlow(new JobConf(job.getConfiguration()), true);

    CascadeConnector cascadeConnector = new CascadeConnector();

    // pass out of order
    Cascade cascade = cascadeConnector.connect(firstFlow, secondFlow, thirdMR, firstMR, secondMR);

    cascade.complete();

    validateLength(new Hfs(new TextLine(), sinkPath3).openForRead(new HadoopFlowProcess(defaultConf)), 10);
}