Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf setInt.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
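
For orientation before the full examples below, here is a minimal, self-contained sketch that pairs setInt with its getInt counterpart. The property name "example.retry.count" is a made-up placeholder, not a real Hadoop key.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store an int under a property name; like any Configuration
        // property, the value is held internally in string form.
        conf.setInt("example.retry.count", 3);

        // Read it back; the second argument is the default returned
        // when the property is not set.
        int retries = conf.getInt("example.retry.count", 1);
        System.out.println("retries = " + retries); // prints: retries = 3
    }
}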

Usage

From source file: org.apache.tez.mapreduce.examples.MRRSleepJob.java

License: Apache License

public DAG createDAG(FileSystem remoteFs, Configuration conf, Path remoteStagingDir, int numMapper,
        int numReducer, int iReduceStagesCount, int numIReducer, long mapSleepTime, int mapSleepCount,
        long reduceSleepTime, int reduceSleepCount, long iReduceSleepTime, int iReduceSleepCount,
        boolean writeSplitsToDFS, boolean generateSplitsInAM) throws IOException, YarnException {

    Configuration mapStageConf = new JobConf(conf);
    mapStageConf.setInt(MRJobConfig.NUM_MAPS, numMapper);
    mapStageConf.setLong(MAP_SLEEP_TIME, mapSleepTime);
    mapStageConf.setLong(REDUCE_SLEEP_TIME, reduceSleepTime);
    mapStageConf.setLong(IREDUCE_SLEEP_TIME, iReduceSleepTime);
    mapStageConf.setInt(MAP_SLEEP_COUNT, mapSleepCount);
    mapStageConf.setInt(REDUCE_SLEEP_COUNT, reduceSleepCount);
    mapStageConf.setInt(IREDUCE_SLEEP_COUNT, iReduceSleepCount);
    mapStageConf.setInt(IREDUCE_STAGES_COUNT, iReduceStagesCount);
    mapStageConf.setInt(IREDUCE_TASKS_COUNT, numIReducer);
    mapStageConf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    mapStageConf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    if (numIReducer == 0 && numReducer == 0) {
        mapStageConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());
    }

    MRHelpers.translateMRConfToTez(mapStageConf);

    Configuration[] intermediateReduceStageConfs = null;
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        intermediateReduceStageConfs = new JobConf[iReduceStagesCount];
        for (int i = 1; i <= iReduceStagesCount; ++i) {
            JobConf iReduceStageConf = new JobConf(conf);
            iReduceStageConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, iReduceSleepTime);
            iReduceStageConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, iReduceSleepCount);
            iReduceStageConf.setInt(MRJobConfig.NUM_REDUCES, numIReducer);
            iReduceStageConf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
            iReduceStageConf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

            MRHelpers.translateMRConfToTez(iReduceStageConf);
            intermediateReduceStageConfs[i - 1] = iReduceStageConf;
        }
    }

    Configuration finalReduceConf = null;
    if (numReducer > 0) {
        finalReduceConf = new JobConf(conf);
        finalReduceConf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, reduceSleepTime);
        finalReduceConf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, reduceSleepCount);
        finalReduceConf.setInt(MRJobConfig.NUM_REDUCES, numReducer);
        finalReduceConf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
        finalReduceConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, NullOutputFormat.class.getName());

        MRHelpers.translateMRConfToTez(finalReduceConf);
    }

    MRHelpers.configureMRApiUsage(mapStageConf);
    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            MRHelpers.configureMRApiUsage(intermediateReduceStageConfs[i]);
        }
    }
    if (numReducer > 0) {
        MRHelpers.configureMRApiUsage(finalReduceConf);
    }

    DataSourceDescriptor dataSource = null;
    if (!generateSplitsInAM && writeSplitsToDFS) {

        LOG.info("Writing splits to DFS");
        dataSource = MRInputHelpers.configureMRInputWithLegacySplitGeneration(mapStageConf, remoteStagingDir,
                true);
    } else {
        dataSource = MRInputLegacy.createConfigBuilder(mapStageConf, SleepInputFormat.class)
                .generateSplitsInAM(generateSplitsInAM).build();
    }

    DAG dag = DAG.create("MRRSleepJob");
    String jarPath = ClassUtil.findContainingJar(getClass());
    if (jarPath == null) {
        throw new TezUncheckedException(
                "Could not find any jar containing" + " MRRSleepJob.class in the classpath");
    }
    Path remoteJarPath = remoteFs.makeQualified(new Path(remoteStagingDir, "dag_job.jar"));
    remoteFs.copyFromLocalFile(new Path(jarPath), remoteJarPath);
    FileStatus jarFileStatus = remoteFs.getFileStatus(remoteJarPath);

    TokenCache.obtainTokensForNamenodes(this.credentials, new Path[] { remoteJarPath }, mapStageConf);

    Map<String, LocalResource> commonLocalResources = new HashMap<String, LocalResource>();
    LocalResource dagJarLocalRsrc = LocalResource.newInstance(ConverterUtils.getYarnUrlFromPath(remoteJarPath),
            LocalResourceType.FILE, LocalResourceVisibility.APPLICATION, jarFileStatus.getLen(),
            jarFileStatus.getModificationTime());
    commonLocalResources.put("dag_job.jar", dagJarLocalRsrc);

    List<Vertex> vertices = new ArrayList<Vertex>();

    UserPayload mapUserPayload = TezUtils.createUserPayloadFromConf(mapStageConf);
    int numTasks = generateSplitsInAM ? -1 : numMapper;

    Map<String, String> mapEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, mapEnv, true);
    Map<String, String> reduceEnv = Maps.newHashMap();
    MRHelpers.updateEnvBasedOnMRTaskEnv(mapStageConf, reduceEnv, false);

    Vertex mapVertex = Vertex.create("map",
            ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(mapUserPayload), numTasks,
            MRHelpers.getResourceForMRMapper(mapStageConf));
    mapVertex.addTaskLocalFiles(commonLocalResources).addDataSource("MRInput", dataSource)
            .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRMapper(mapStageConf)).setTaskEnvironment(mapEnv);
    vertices.add(mapVertex);

    if (iReduceStagesCount > 0 && numIReducer > 0) {
        for (int i = 0; i < iReduceStagesCount; ++i) {
            Configuration iconf = intermediateReduceStageConfs[i];
            UserPayload iReduceUserPayload = TezUtils.createUserPayloadFromConf(iconf);
            Vertex ivertex = Vertex.create("ireduce" + (i + 1),
                    ProcessorDescriptor.create(ReduceProcessor.class.getName())
                            .setUserPayload(iReduceUserPayload),
                    numIReducer, MRHelpers.getResourceForMRReducer(intermediateReduceStageConfs[i]));
            ivertex.addTaskLocalFiles(commonLocalResources)
                    .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(intermediateReduceStageConfs[i]))
                    .setTaskEnvironment(reduceEnv);
            vertices.add(ivertex);
        }
    }

    Vertex finalReduceVertex = null;
    if (numReducer > 0) {
        UserPayload reducePayload = TezUtils.createUserPayloadFromConf(finalReduceConf);
        finalReduceVertex = Vertex.create("reduce",
                ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(reducePayload),
                numReducer, MRHelpers.getResourceForMRReducer(finalReduceConf));
        finalReduceVertex.addTaskLocalFiles(commonLocalResources)
                .addDataSink("MROutput",
                        MROutputLegacy.createConfigBuilder(finalReduceConf, NullOutputFormat.class).build())
                .setTaskLaunchCmdOpts(MRHelpers.getJavaOptsForMRReducer(finalReduceConf))
                .setTaskEnvironment(reduceEnv);
        vertices.add(finalReduceVertex);
    } else {
        // Map only job
        mapVertex.addDataSink("MROutput",
                MROutputLegacy.createConfigBuilder(mapStageConf, NullOutputFormat.class).build());
    }

    Map<String, String> partitionerConf = Maps.newHashMap();
    partitionerConf.put(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());
    OrderedPartitionedKVEdgeConfig edgeConf = OrderedPartitionedKVEdgeConfig
            .newBuilder(IntWritable.class.getName(), IntWritable.class.getName(),
                    HashPartitioner.class.getName(), partitionerConf)
            .configureInput().useLegacyInput().done().build();

    for (int i = 0; i < vertices.size(); ++i) {
        dag.addVertex(vertices.get(i));
        if (i != 0) {
            dag.addEdge(
                    Edge.create(vertices.get(i - 1), vertices.get(i), edgeConf.createDefaultEdgeProperty()));
        }
    }

    return dag;
}

From source file: org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java

License: Apache License

@Test(timeout = 5000)
public void verifyReduceKeyTranslation() {
    JobConf jobConf = new JobConf();

    jobConf.setFloat(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT, 0.4f);
    jobConf.setLong(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES, 20000L);
    jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000);
    jobConf.setFloat(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT, 0.55f);
    jobConf.setFloat(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD, 0.60f);
    jobConf.setFloat(MRJobConfig.SHUFFLE_MERGE_PERCENT, 0.22f);
    jobConf.setBoolean(MRJobConfig.REDUCE_MEMTOMEM_ENABLED, true);
    jobConf.setFloat(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT, 0.33f);

    MRHelpers.translateMRConfToTez(jobConf);

    assertEquals(0.4f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0f),
            0.01f);
    assertEquals(20000L, jobConf.getLong(Constants.TEZ_RUNTIME_TASK_MEMORY, 0));
    assertEquals(2000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0));
    assertEquals(0.55f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0),
            0.01f);
    assertEquals(0.60f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0),
            0.01f);
    assertEquals(0.22f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0), 0.01f);
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false));
    assertEquals(0.33f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0), 0.01f);
}

From source file: org.apache.tez.mapreduce.hadoop.TestDeprecatedKeys.java

License: Apache License

@Test(timeout = 5000)
/**
 * Set of keys that can be overridden at Tez runtime
 */
public void verifyTezOverridenKeys() {
    JobConf jobConf = new JobConf();
    jobConf.setInt(MRJobConfig.IO_SORT_FACTOR, 2000);
    jobConf.setInt(MRJobConfig.IO_SORT_MB, 100);
    jobConf.setInt(MRJobConfig.COUNTERS_MAX_KEY, 100);

    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 1000);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 200);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 20);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SORT_SPILL_PERCENT, 0.2f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 20);
    jobConf.setInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, true);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 10);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 10.0f);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 10.0f);
    jobConf.setInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 10);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, true);
    jobConf.setFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 10.0f);
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, "DefaultSorter");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, "groupComparator");
    jobConf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, "SecondaryComparator");

    jobConf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, false);
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, true);

    MRHelpers.translateMRConfToTez(jobConf);

    assertEquals(1000, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, 0));
    assertEquals(200, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_MB, 100));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD, false));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IFILE_READAHEAD_BYTES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_INDEX_CACHE_MEMORY_LIMIT_BYTES, 0));
    assertEquals(20, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_COMBINE_MIN_SPILLS, 0));
    assertEquals(10, jobConf.getInt(Constants.TEZ_RUNTIME_TASK_MEMORY, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_PARALLEL_COPIES, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_FAILURES_LIMIT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_NOTIFY_READERROR, false));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_CONNECT_TIMEOUT, 0));
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_READ_TIMEOUT, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_SSL, false));
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_FETCH_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMORY_LIMIT_PERCENT, 0.0f), 0.0f);
    assertEquals(10.0f, jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MERGE_PERCENT, 0.0f),
            0.0f);
    assertEquals(10, jobConf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_MEMTOMEM_SEGMENTS, 0));
    assertEquals(true, jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_SHUFFLE_ENABLE_MEMTOMEM, false));
    assertEquals(10.0f,
            jobConf.getFloat(TezRuntimeConfiguration.TEZ_RUNTIME_INPUT_POST_MERGE_BUFFER_PERCENT, 0.0f), 0.0f);
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertEquals("groupComparator",
            jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_GROUP_COMPARATOR_CLASS, ""));
    assertEquals("SecondaryComparator",
            jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_SECONDARY_COMPARATOR_CLASS, ""));
    assertEquals("DefaultSorter", jobConf.get(TezRuntimeConfiguration.TEZ_RUNTIME_INTERNAL_SORTER_CLASS, ""));
    assertTrue(jobConf.getBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, false));

    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD));
    assertNull(jobConf.get(MRConfig.MAPRED_IFILE_READAHEAD_BYTES));
    assertNull(jobConf.get(MRJobConfig.RECORDS_BEFORE_PROGRESS));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_FACTOR));
    assertNull(jobConf.get(MRJobConfig.IO_SORT_MB));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_READ_TIMEOUT));
    assertNull(jobConf.get(MRJobConfig.INDEX_CACHE_MEMORY_LIMIT));
    assertNull(jobConf.get(MRJobConfig.MAP_COMBINE_MIN_SPILLS));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMORY_TOTAL_BYTES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_PARALLEL_COPIES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_FETCH_FAILURES));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_NOTIFY_READERROR));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_CONNECT_TIMEOUT));
    assertNull(jobConf.get(MRConfig.SHUFFLE_SSL_ENABLED_KEY));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.SHUFFLE_MEMORY_LIMIT_PERCENT));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_THRESHOLD));
    assertNull(jobConf.get(MRJobConfig.REDUCE_MEMTOMEM_ENABLED));
    assertNull(jobConf.get(MRJobConfig.REDUCE_INPUT_BUFFER_PERCENT));
    assertNull(jobConf.get(MRJobConfig.GROUP_COMPARATOR_CLASS));
    assertNull(jobConf.get("map.sort.class"));
}

From source file: org.apache.tez.mapreduce.hadoop.TestMRHelpers.java

License: Apache License

@Test(timeout = 5000)
public void testContainerResourceConstruction() {
    JobConf conf = new JobConf(new Configuration());
    Resource mapResource = MRHelpers.getResourceForMRMapper(conf);
    Resource reduceResource = MRHelpers.getResourceForMRReducer(conf);

    Assert.assertEquals(MRJobConfig.DEFAULT_MAP_CPU_VCORES, mapResource.getVirtualCores());
    Assert.assertEquals(MRJobConfig.DEFAULT_MAP_MEMORY_MB, mapResource.getMemory());
    Assert.assertEquals(MRJobConfig.DEFAULT_REDUCE_CPU_VCORES, reduceResource.getVirtualCores());
    Assert.assertEquals(MRJobConfig.DEFAULT_REDUCE_MEMORY_MB, reduceResource.getMemory());

    conf.setInt(MRJobConfig.MAP_CPU_VCORES, 2);
    conf.setInt(MRJobConfig.MAP_MEMORY_MB, 123);
    conf.setInt(MRJobConfig.REDUCE_CPU_VCORES, 20);
    conf.setInt(MRJobConfig.REDUCE_MEMORY_MB, 1234);

    mapResource = MRHelpers.getResourceForMRMapper(conf);
    reduceResource = MRHelpers.getResourceForMRReducer(conf);

    Assert.assertEquals(2, mapResource.getVirtualCores());
    Assert.assertEquals(123, mapResource.getMemory());
    Assert.assertEquals(20, reduceResource.getVirtualCores());
    Assert.assertEquals(1234, reduceResource.getMemory());
}

From source file: org.apache.tez.mapreduce.processor.map.TestMapProcessor.java

License: Apache License

@Test(timeout = 5000)
public void testMapProcessor() throws Exception {
    String dagName = "mrdag0";
    String vertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);

    MRHelpers.translateMRConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());

    Path mapInput = new Path(workDir, "map0");

    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

    InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
            InputDescriptor.create(MRInputLegacy.class.getName())
                    .setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto
                            .newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf))
                            .build().toByteArray()))),
            1);
    OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    LogicalIOProcessorRuntimeTask task = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0,
            new Path(workDir, "map0"), new TestUmbilical(), dagName, vertexName,
            Collections.singletonList(mapInputSpec), Collections.singletonList(mapOutputSpec));

    task.initialize();
    task.run();
    task.close();

    OutputContext outputContext = task.getOutputContexts().iterator().next();
    TezTaskOutput mapOutputs = new TezTaskOutputFiles(jobConf, outputContext.getUniqueIdentifier());

    // TODO NEWTEZ FIXME OutputCommitter verification
    //    MRTask mrTask = (MRTask)t.getProcessor();
    //    Assert.assertEquals(TezNullOutputCommitter.class.getName(), mrTask
    //        .getCommitter().getClass().getName());
    //    t.close();

    Path mapOutputFile = getMapOutputFile(jobConf, outputContext);
    LOG.info("mapOutputFile = " + mapOutputFile);
    IFile.Reader reader = new IFile.Reader(localFs, mapOutputFile, null, null, null, false, 0, -1);
    LongWritable key = new LongWritable();
    Text value = new Text();
    DataInputBuffer keyBuf = new DataInputBuffer();
    DataInputBuffer valueBuf = new DataInputBuffer();
    long prev = Long.MIN_VALUE;
    while (reader.nextRawKey(keyBuf)) {
        reader.nextRawValue(valueBuf);
        key.readFields(keyBuf);
        value.readFields(valueBuf);
        if (prev != Long.MIN_VALUE) {
            assert (prev <= key.get());
        }
        // Track the previous key so the ordering check covers every record.
        prev = key.get();
        LOG.info("key = " + key.get() + "; value = " + value);
    }
    reader.close();
}

From source file: org.apache.tez.mapreduce.processor.MRTask.java

License: Apache License

public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
    jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
    jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
    jobConf.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
    jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());

    jobConf.setBoolean(MRJobConfig.TASK_ISMAP, isMap);

    Path outputPath = FileOutputFormat.getOutputPath(jobConf);
    if (outputPath != null) {
        if ((committer instanceof FileOutputCommitter)) {
            FileOutputFormat.setWorkOutputPath(jobConf,
                    ((FileOutputCommitter) committer).getTaskAttemptPath(taskAttemptContext));
        } else {
            FileOutputFormat.setWorkOutputPath(jobConf, outputPath);
        }
    }
}

From source file: org.apache.tez.mapreduce.processor.reduce.TestReduceProcessor.java

License: Apache License

@Test(timeout = 5000)
public void testReduceProcessor() throws Exception {
    final String dagName = "mrdag0";
    String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);

    MRHelpers.translateMRConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    Path mapInput = new Path(workDir, "map0");
    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

    InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
            InputDescriptor.create(MRInputLegacy.class.getName())
                    .setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto
                            .newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf))
                            .build().toByteArray()))),
            1);
    OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    // Run a map

    TestUmbilical testUmbilical = new TestUmbilical();

    LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0, mapInput,
            testUmbilical, dagName, mapVertexName, Collections.singletonList(mapInputSpec),
            Collections.singletonList(mapOutputSpec));

    mapTask.initialize();
    mapTask.run();
    mapTask.close();

    // One VME, One DME
    Assert.assertEquals(2, testUmbilical.getEvents().size());
    Assert.assertEquals(EventType.VERTEX_MANAGER_EVENT, testUmbilical.getEvents().get(0).getEventType());
    Assert.assertEquals(EventType.COMPOSITE_DATA_MOVEMENT_EVENT,
            testUmbilical.getEvents().get(1).getEventType());

    CompositeDataMovementEvent cdmEvent = (CompositeDataMovementEvent) testUmbilical.getEvents().get(1)
            .getEvent();
    Assert.assertEquals(1, cdmEvent.getCount());
    DataMovementEvent dme = cdmEvent.getEvents().iterator().next();
    dme.setTargetIndex(0);

    LOG.info("Starting reduce...");

    JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(dagName));
    JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
    Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(identifier, jobTokenSecretManager);
    shuffleToken.setService(identifier.getJobId());

    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, true);
    FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
    ProcessorDescriptor reduceProcessorDesc = ProcessorDescriptor.create(ReduceProcessor.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));

    InputSpec reduceInputSpec = new InputSpec(mapVertexName,
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    OutputSpec reduceOutputSpec = new OutputSpec("NullDestinationVertex", OutputDescriptor
            .create(MROutputLegacy.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    // Now run a reduce
    TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName,
            -1, reduceProcessorDesc, Collections.singletonList(reduceInputSpec),
            Collections.singletonList(reduceOutputSpec), null);

    Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
    serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
            ShuffleUtils.convertJobTokenToBytes(shuffleToken));
    Map<String, String> serviceProviderEnvMap = new HashMap<String, String>();
    ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
    AuxiliaryServiceHelper.setServiceDataIntoEnv(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, shufflePortBb,
            serviceProviderEnvMap);

    LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf,
            new String[] { workDir.toString() }, new TestUmbilical(), serviceConsumerMetadata,
            serviceProviderEnvMap, HashMultimap.<String, String>create(), null, "",
            new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory());

    List<Event> destEvents = new LinkedList<Event>();
    destEvents.add(dme);
    task.initialize();
    OrderedGroupedInputLegacy sortedOut = (OrderedGroupedInputLegacy) task.getInputs().values().iterator()
            .next();
    sortedOut.handleEvents(destEvents);
    task.run();
    task.close();

    // MRTask mrTask = (MRTask)t.getProcessor();
    // TODO NEWTEZ Verify the partitioner has not been created
    // Likely not applicable anymore.
    // Assert.assertNull(mrTask.getPartitioner());

    // Only a task commit happens, hence the data is still in the temporary directory.
    Path reduceOutputDir = new Path(new Path(workDir, "output"),
            "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));

    Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");

    SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);

    LongWritable key = new LongWritable();
    Text value = new Text();
    long prev = Long.MIN_VALUE;
    while (reader.next(key, value)) {
        if (prev != Long.MIN_VALUE) {
            Assert.assertTrue(prev < key.get());
        }
        // Track the previous key so the ordering check covers every record.
        prev = key.get();
    }

    reader.close();
}

From source file: org.apache.tez.mapreduce.task.MRRuntimeTask.java

License: Apache License

private static void configureMRTask(JobConf job, MRTask task) throws IOException, InterruptedException {

    Credentials credentials = UserGroupInformation.getCurrentUser().getCredentials();
    job.setCredentials(credentials);
    // TODO Can this be avoided altogether? Have the MRTezOutputCommitter use
    // the Tez parameter.
    // TODO This could be fetched from the env if YARN is setting it for all
    // Containers.
    // Set it in the conf so that it can be used by the OutputCommitter.
    job.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, job.getInt(TezJobConfig.APPLICATION_ATTEMPT_ID, -1));

    job.setClass(MRConfig.TASK_LOCAL_OUTPUT_CLASS, YarnOutputFiles.class, MapOutputFile.class); // MR

    Token<JobTokenIdentifier> jobToken = TokenCache.getJobToken(credentials);
    if (jobToken != null) {
        // Will MR ever run without a job token?
        SecretKey sk = JobTokenSecretManager.createSecretKey(jobToken.getPassword());
        task.setJobTokenSecret(sk);
    } else {
        LOG.warn("No job token set");
    }

    job.set(MRJobConfig.JOB_LOCAL_DIR, job.get(TezJobConfig.JOB_LOCAL_DIR));
    job.set(MRConfig.LOCAL_DIR, job.get(TezJobConfig.LOCAL_DIRS));
    if (job.get(TezJobConfig.DAG_CREDENTIALS_BINARY) != null) {
        job.set(MRJobConfig.MAPREDUCE_JOB_CREDENTIALS_BINARY, job.get(TezJobConfig.DAG_CREDENTIALS_BINARY));
    }

    // setup the child's attempt directories
    // Do the task-type specific localization
    task.localizeConfiguration(job);

    // Set up the DistributedCache related configs
    setupDistributedCacheConfig(job);

    task.setConf(job);
}

From source file: org.apache.tez.mapreduce.TestMRRJobsDAGApi.java

License: Apache License

public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning,
        boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM,
        Class<? extends InputInitializer> initializerClass, Map<String, LocalResource> additionalLocalResources)
        throws IOException, InterruptedException, TezException, ClassNotFoundException, YarnException {
    LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");

    JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());

    stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
    stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
    stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
    stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

    stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

    stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());

    MRHelpers.translateMRConfToTez(stage1Conf);
    MRHelpers.translateMRConfToTez(stage2Conf);
    MRHelpers.translateMRConfToTez(stage3Conf);
    MRHelpers.configureMRApiUsage(stage1Conf);
    MRHelpers.configureMRApiUsage(stage2Conf);
    MRHelpers.configureMRApiUsage(stage3Conf);

    Path remoteStagingDir = remoteFs
            .makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);

    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
    UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);

    DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));

    Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM
            ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass)
            : null;
    LOG.info("Using initializer class: " + initializerClass);

    DataSourceDescriptor dsd;
    if (!genSplitsInAM) {
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
    } else {
        if (initializerClass == null) {
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
        } else {
            InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class)
                    .setCustomInitializerDescriptor(iid).build();
        }
    }

    Vertex stage1Vertex = Vertex.create("map",
            ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload),
            dsd.getNumberOfShards(), Resource.newInstance(256, 1));
    stage1Vertex.addDataSource("MRInput", dsd);
    Vertex stage2Vertex = Vertex.create("ireduce",
            ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload), 1,
            Resource.newInstance(256, 1));
    Vertex stage3Vertex = Vertex.create("reduce",
            ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload), 1,
            Resource.newInstance(256, 1));
    stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
    DataSinkDescriptor dataSinkDescriptor = MROutputLegacy
            .createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
    Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
    stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);

    // TODO env, resources

    dag.addVertex(stage1Vertex);
    dag.addVertex(stage2Vertex);
    dag.addVertex(stage3Vertex);

    Edge edge1 = Edge.create(stage1Vertex, stage2Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload),
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
    Edge edge2 = Edge.create(stage2Vertex, stage3Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload),
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));

    dag.addEdge(edge1);
    dag.addEdge(edge2);

    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());

    DAGClient dagClient = null;
    boolean reuseSession = reUseTezSession != null;
    TezClient tezSession = null;
    if (!dagViaRPC) {
        Preconditions.checkArgument(reuseSession == false);
    }
    if (!reuseSession) {
        TezConfiguration tempTezconf = new TezConfiguration(tezConf);
        if (!dagViaRPC) {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
        } else {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
        }
        tezSession = TezClient.create("testsession", tempTezconf);
        tezSession.start();
    } else {
        tezSession = reUseTezSession;
    }
    if (!dagViaRPC) {
        // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
        dagClient = tezSession.submitDAG(dag);
    }

    if (dagViaRPC && closeSessionBeforeSubmit) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(mrrTezCluster.getConfig());
        yarnClient.start();
        boolean sentKillSession = false;
        while (true) {
            Thread.sleep(500L);
            ApplicationReport appReport = yarnClient
                    .getApplicationReport(tezSession.getAppMasterApplicationId());
            if (appReport == null) {
                continue;
            }
            YarnApplicationState appState = appReport.getYarnApplicationState();
            if (!sentKillSession) {
                if (appState == YarnApplicationState.RUNNING) {
                    tezSession.stop();
                    sentKillSession = true;
                }
            } else {
                if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED
                        || appState == YarnApplicationState.FAILED) {
                    LOG.info("Application completed after sending session shutdown" + ", yarnApplicationState="
                            + appState + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
                    Assert.assertEquals(YarnApplicationState.FINISHED, appState);
                    Assert.assertEquals(FinalApplicationStatus.SUCCEEDED,
                            appReport.getFinalApplicationStatus());
                    break;
                }
            }
        }
        yarnClient.stop();
        return null;
    }

    if (dagViaRPC) {
        LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId()
                + " and Dag Name=" + dag.getName());
        if (additionalLocalResources != null) {
            tezSession.addAppMasterLocalFiles(additionalLocalResources);
        }
        dagClient = tezSession.submitDAG(dag);
        Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
    }
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
        LOG.info(
                "Waiting for job to complete. Sleeping for 500ms." + " Current state: " + dagStatus.getState());
        Thread.sleep(500L);
        if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
            LOG.info("Killing running dag/session");
            if (dagViaRPC) {
                tezSession.stop();
            } else {
                dagClient.tryKillDAG();
            }
        }
        dagStatus = dagClient.getDAGStatus(null);
    }
    if (!reuseSession) {
        tezSession.stop();
    }
    return dagStatus.getState();
}

From source file: org.archive.hadoop.jobs.ArchiveFileExtractor.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("Archive File Extractor");

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // turn off speculative execution
    job.setBoolean("mapred.map.tasks.speculative.execution", false);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // tolerate task exceptions when "soft" mode is enabled (off by default; see the -soft flag below)
    job.setBoolean("soft", false);

    int arg = 0;
    int numMaps = 10;

    String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
            + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
            + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

    String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
            DateUtils.getLog17Date(System.currentTimeMillis()));

    while (arg < args.length - 1) {
        if (args[arg].equals("-soft")) {
            job.setBoolean("soft", true);
            arg++;
        } else if (args[arg].equals("-mappers")) {
            arg++;
            numMaps = Integer.parseInt(args[arg]);
            job.setNumMapTasks(numMaps);
            arg++;
        } else if (args[arg].equals("-timestamp14")) {
            arg++;
            String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
            job.set("timestamp14", timestamp14);
            arg++;
        } else if (args[arg].equals("-warc-header-local-file")) {
            arg++;
            File f = new File(args[arg]);
            FileInputStream fis = new FileInputStream(f);
            warcHeaderString = IOUtils.toString(fis, "UTF-8");
            arg++;
        } else if (args[arg].equals("-hmacname")) {
            arg++;
            String hmacName = args[arg];
            job.set("hmacName", hmacName);
            arg++;
        } else if (args[arg].equals("-hmacsignature")) {
            arg++;
            String hmacSignature = args[arg];
            job.set("hmacSignature", hmacSignature);
            arg++;
        } else if (args[arg].equals("-timeout")) {
            arg++;
            int taskTimeout = Integer.parseInt(args[arg]);
            job.setInt("mapred.task.timeout", taskTimeout);
            arg++;
        } else if (args[arg].equals("-failpct")) {
            arg++;
            int failPct = Integer.parseInt(args[arg]);
            job.setInt("mapred.max.map.failures.percent", failPct);
            arg++;
        } else {
            break;
        }
    }

    job.set("warcHeaderString", warcHeaderString);

    if (args.length - 2 != arg) {
        printUsage();
        return 1;
    }

    Path inputPath = new Path(args[arg]);
    arg++;

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    Path outputPath = new Path(outputDir);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(ArchiveFileExtractorMapper.class);
    job.setJarByClass(ArchiveFileExtractor.class);

    TextInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}