Example usage for org.apache.hadoop.mapred JobConf setBoolean

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Source Link

Document

Set the value of the name property to a boolean.
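
Before the collected sources, a minimal self-contained sketch of the call itself (not taken from any of the files below; the key mapreduce.map.speculative is just an illustrative standard Hadoop property):

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store a boolean under the given property name.
        conf.setBoolean("mapreduce.map.speculative", false);

        // Read it back; the second argument is the default returned
        // when the property is unset.
        boolean speculative = conf.getBoolean("mapreduce.map.speculative", true);
        System.out.println("speculative=" + speculative); // prints speculative=false
    }
}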

Usage

From source file: org.apache.tez.mapreduce.processor.MRTask.java

License: Apache License

public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
    jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
    jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
    jobConf.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
    jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());

    jobConf.setBoolean(MRJobConfig.TASK_ISMAP, isMap);

    Path outputPath = FileOutputFormat.getOutputPath(jobConf);
    if (outputPath != null) {
        if ((committer instanceof FileOutputCommitter)) {
            FileOutputFormat.setWorkOutputPath(jobConf,
                    ((FileOutputCommitter) committer).getTaskAttemptPath(taskAttemptContext));
        } else {
            FileOutputFormat.setWorkOutputPath(jobConf, outputPath);
        }
    }
}

From source file: org.apache.tez.mapreduce.processor.reduce.ReduceProcessor.java

License: Apache License

@Override
public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
    super.localizeConfiguration(jobConf);
    jobConf.setBoolean(JobContext.TASK_ISMAP, false);
}

From source file: org.apache.tez.mapreduce.processor.reduce.TestReduceProcessor.java

License: Apache License

@Test(timeout = 5000)
public void testReduceProcessor() throws Exception {
    final String dagName = "mrdag0";
    String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
    String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
    JobConf jobConf = new JobConf(defaultConf);
    setUpJobConf(jobConf);

    MRHelpers.translateMRConfToTez(jobConf);
    jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);

    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

    Path mapInput = new Path(workDir, "map0");
    MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

    InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
            InputDescriptor.create(MRInputLegacy.class.getName())
                    .setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto
                            .newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf))
                            .build().toByteArray()))),
            1);
    OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    // Run a map

    TestUmbilical testUmbilical = new TestUmbilical();

    LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0, mapInput,
            testUmbilical, dagName, mapVertexName, Collections.singletonList(mapInputSpec),
            Collections.singletonList(mapOutputSpec));

    mapTask.initialize();
    mapTask.run();
    mapTask.close();

    // One VME, One DME
    Assert.assertEquals(2, testUmbilical.getEvents().size());
    Assert.assertEquals(EventType.VERTEX_MANAGER_EVENT, testUmbilical.getEvents().get(0).getEventType());
    Assert.assertEquals(EventType.COMPOSITE_DATA_MOVEMENT_EVENT,
            testUmbilical.getEvents().get(1).getEventType());

    CompositeDataMovementEvent cdmEvent = (CompositeDataMovementEvent) testUmbilical.getEvents().get(1)
            .getEvent();
    Assert.assertEquals(1, cdmEvent.getCount());
    DataMovementEvent dme = cdmEvent.getEvents().iterator().next();
    dme.setTargetIndex(0);

    LOG.info("Starting reduce...");

    JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(dagName));
    JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
    Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(identifier, jobTokenSecretManager);
    shuffleToken.setService(identifier.getJobId());

    jobConf.setOutputFormat(SequenceFileOutputFormat.class);
    jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
            new Path(workDir, "localized-resources").toUri().toString());
    jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, true);
    FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
    ProcessorDescriptor reduceProcessorDesc = ProcessorDescriptor.create(ReduceProcessor.class.getName())
            .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));

    InputSpec reduceInputSpec = new InputSpec(mapVertexName,
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName())
                    .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);
    OutputSpec reduceOutputSpec = new OutputSpec("NullDestinationVertex", OutputDescriptor
            .create(MROutputLegacy.class.getName()).setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
            1);

    // Now run a reduce
    TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName,
            -1, reduceProcessorDesc, Collections.singletonList(reduceInputSpec),
            Collections.singletonList(reduceOutputSpec), null);

    Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
    serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
            ShuffleUtils.convertJobTokenToBytes(shuffleToken));
    Map<String, String> serviceProviderEnvMap = new HashMap<String, String>();
    ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
    AuxiliaryServiceHelper.setServiceDataIntoEnv(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, shufflePortBb,
            serviceProviderEnvMap);

    LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf,
            new String[] { workDir.toString() }, new TestUmbilical(), serviceConsumerMetadata,
            serviceProviderEnvMap, HashMultimap.<String, String>create(), null, "",
            new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory());

    List<Event> destEvents = new LinkedList<Event>();
    destEvents.add(dme);
    task.initialize();
    OrderedGroupedInputLegacy sortedOut = (OrderedGroupedInputLegacy) task.getInputs().values().iterator()
            .next();
    sortedOut.handleEvents(destEvents);
    task.run();
    task.close();

    // MRTask mrTask = (MRTask)t.getProcessor();
    // TODO NEWTEZ Verify the partitioner has not been created
    // Likely not applicable anymore.
    // Assert.assertNull(mrTask.getPartitioner());

    // Only a task commit happens, hence the data is still in the temporary directory.
    Path reduceOutputDir = new Path(new Path(workDir, "output"),
            "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));

    Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");

    SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);

    LongWritable key = new LongWritable();
    Text value = new Text();
    long prev = Long.MIN_VALUE;
    // Verify the keys come out of the reduce in strictly increasing order.
    while (reader.next(key, value)) {
        if (prev != Long.MIN_VALUE) {
            Assert.assertTrue(prev < key.get());
        }
        prev = key.get();
    }

    reader.close();
}

From source file: org.apache.tez.mapreduce.TestMRRJobsDAGApi.java

License: Apache License

public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning,
        boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM,
        Class<? extends InputInitializer> initializerClass, Map<String, LocalResource> additionalLocalResources)
        throws IOException, InterruptedException, TezException, ClassNotFoundException, YarnException {
    LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");

    JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
    JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());

    stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
    stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
    stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
    stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
    stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

    stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
    stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

    stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
    stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
    stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
    stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
    stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());

    MRHelpers.translateMRConfToTez(stage1Conf);
    MRHelpers.translateMRConfToTez(stage2Conf);
    MRHelpers.translateMRConfToTez(stage3Conf);
    MRHelpers.configureMRApiUsage(stage1Conf);
    MRHelpers.configureMRApiUsage(stage2Conf);
    MRHelpers.configureMRApiUsage(stage3Conf);

    Path remoteStagingDir = remoteFs
            .makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
    TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);

    UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
    UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
    UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);

    DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));

    Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM
            ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass)
            : null;
    LOG.info("Using initializer class: " + initializerClass);

    DataSourceDescriptor dsd;
    if (!genSplitsInAM) {
        dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
    } else {
        if (initializerClass == null) {
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
        } else {
            InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
            dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class)
                    .setCustomInitializerDescriptor(iid).build();
        }
    }

    Vertex stage1Vertex = Vertex.create("map",
            ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload),
            dsd.getNumberOfShards(), Resource.newInstance(256, 1));
    stage1Vertex.addDataSource("MRInput", dsd);
    Vertex stage2Vertex = Vertex.create("ireduce",
            ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload), 1,
            Resource.newInstance(256, 1));
    Vertex stage3Vertex = Vertex.create("reduce",
            ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload), 1,
            Resource.newInstance(256, 1));
    stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
    DataSinkDescriptor dataSinkDescriptor = MROutputLegacy
            .createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
    Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
    stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);

    // TODO env, resources

    dag.addVertex(stage1Vertex);
    dag.addVertex(stage2Vertex);
    dag.addVertex(stage3Vertex);

    Edge edge1 = Edge.create(stage1Vertex, stage2Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload),
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
    Edge edge2 = Edge.create(stage2Vertex, stage3Vertex, EdgeProperty.create(DataMovementType.SCATTER_GATHER,
            DataSourceType.PERSISTED, SchedulingType.SEQUENTIAL,
            OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload),
            InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));

    dag.addEdge(edge1);
    dag.addEdge(edge2);

    TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
    tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());

    DAGClient dagClient = null;
    boolean reuseSession = reUseTezSession != null;
    TezClient tezSession = null;
    if (!dagViaRPC) {
        Preconditions.checkArgument(reuseSession == false);
    }
    if (!reuseSession) {
        TezConfiguration tempTezconf = new TezConfiguration(tezConf);
        if (!dagViaRPC) {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
        } else {
            tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
        }
        tezSession = TezClient.create("testsession", tempTezconf);
        tezSession.start();
    } else {
        tezSession = reUseTezSession;
    }
    if (!dagViaRPC) {
        // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
        dagClient = tezSession.submitDAG(dag);
    }

    if (dagViaRPC && closeSessionBeforeSubmit) {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(mrrTezCluster.getConfig());
        yarnClient.start();
        boolean sentKillSession = false;
        while (true) {
            Thread.sleep(500l);
            ApplicationReport appReport = yarnClient
                    .getApplicationReport(tezSession.getAppMasterApplicationId());
            if (appReport == null) {
                continue;
            }
            YarnApplicationState appState = appReport.getYarnApplicationState();
            if (!sentKillSession) {
                if (appState == YarnApplicationState.RUNNING) {
                    tezSession.stop();
                    sentKillSession = true;
                }
            } else {
                if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED
                        || appState == YarnApplicationState.FAILED) {
                    LOG.info("Application completed after sending session shutdown" + ", yarnApplicationState="
                            + appState + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
                    Assert.assertEquals(YarnApplicationState.FINISHED, appState);
                    Assert.assertEquals(FinalApplicationStatus.SUCCEEDED,
                            appReport.getFinalApplicationStatus());
                    break;
                }
            }
        }
        yarnClient.stop();
        return null;
    }

    if (dagViaRPC) {
        LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId()
                + " and Dag Name=" + dag.getName());
        if (additionalLocalResources != null) {
            tezSession.addAppMasterLocalFiles(additionalLocalResources);
        }
        dagClient = tezSession.submitDAG(dag);
        Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
    }
    DAGStatus dagStatus = dagClient.getDAGStatus(null);
    while (!dagStatus.isCompleted()) {
        LOG.info(
                "Waiting for job to complete. Sleeping for 500ms." + " Current state: " + dagStatus.getState());
        Thread.sleep(500l);
        if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
            LOG.info("Killing running dag/session");
            if (dagViaRPC) {
                tezSession.stop();
            } else {
                dagClient.tryKillDAG();
            }
        }
        dagStatus = dagClient.getDAGStatus(null);
    }
    if (!reuseSession) {
        tezSession.stop();
    }
    return dagStatus.getState();
}

From source file: org.apache.trevni.avro.AvroTrevniInputFormat.java

License: Apache License

@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    job.setBoolean("mapred.input.dir.recursive", true);
    for (FileStatus file : super.listStatus(job))
        if (file.getPath().getName().endsWith(AvroTrevniOutputFormat.EXT))
            result.add(file);
    return result.toArray(new FileStatus[0]);
}
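
Note: "mapred.input.dir.recursive" is the old-style (pre-Hadoop-2) property name. A hedged equivalent for newer releases, assuming Hadoop's standard configuration deprecation mapping is in effect:

// The old key above is still honored on Hadoop 2.x+ through the
// deprecation mapping; this is its current name.
job.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);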

From source file: org.archive.access.nutch.jobs.NutchwaxCrawlDb.java

License: Open Source License

public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);

    if (LOG.isInfoEnabled()) {
        LOG.info("NutchwaxCrawlDb update: starting");
        LOG.info("NutchwaxCrawlDb update: db: " + crawlDb);
        LOG.info("NutchwaxCrawlDb update: segment: " + Arrays.asList(segments));
        LOG.info("NutchwaxCrawlDb update: additions allowed: " + additionsAllowed);
        LOG.info("NutchwaxCrawlDb update: URL normalizing: " + normalize);
        LOG.info("NutchwaxCrawlDb update: URL filtering: " + filter);
    }

    JobConf job = CrawlDb.createJob(getConf(), crawlDb);

    // Now, change the map and reduce classes to run; use ours instead.
    job.setMapperClass(NutchwaxCrawlDbFilter.class);

    // Use the Nutch native reducer. It passes the key via the scoring
    // plugins, but as currently implemented they don't expect the key
    // to be a URL.
    // job.setReducerClass(CrawlDbReducer.class);
    job.setJobName("nutchwaxcrawldb " + crawlDb + " " + Arrays.asList(segments));

    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    job.setBoolean(NutchwaxCrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(NutchwaxCrawlDbFilter.URL_NORMALIZING, normalize);

    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);

        if (fs.exists(fetch) && fs.exists(parse)) {
            job.addInputPath(fetch);
            job.addInputPath(parse);
        } else {
            LOG.info("Segment " + segments[i] + " is missing " + CrawlDatum.FETCH_DIR_NAME + " or "
                    + CrawlDatum.PARSE_DIR_NAME + " (skipping).");
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("NutchwaxCrawlDb update: Merging segment data " + Arrays.asList(segments) + " into db.");
    }

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);

        if (fs.exists(job.getOutputPath())) {
            fs.delete(job.getOutputPath());
        }

        throw e;
    }

    NutchwaxCrawlDb.install(job, crawlDb);

    if (LOG.isInfoEnabled()) {
        LOG.info("NutchwaxCrawlDb update: done");
    }
}

From source file: org.archive.hadoop.jobs.ArchiveFileExtractor.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        printUsage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("Archive File Extractor");

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // turn off speculative execution
    job.setBoolean("mapred.map.tasks.speculative.execution", false);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // "soft" mode tolerates task exceptions; off by default, enabled
    // below via the -soft command-line flag.
    job.setBoolean("soft", false);

    int arg = 0;
    int numMaps = 10;

    String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n" + "format: WARC File Format 1.0\r\n"
            + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
            + "publisher: Internet Archive\r\n" + "created: %s\r\n\r\n";

    String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
            DateUtils.getLog17Date(System.currentTimeMillis()));

    while (arg < args.length - 1) {
        if (args[arg].equals("-soft")) {
            job.setBoolean("soft", true);
            arg++;
        } else if (args[arg].equals("-mappers")) {
            arg++;
            numMaps = Integer.parseInt(args[arg]);
            job.setNumMapTasks(numMaps);
            arg++;
        } else if (args[arg].equals("-timestamp14")) {
            arg++;
            String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
            job.set("timestamp14", timestamp14);
            arg++;
        } else if (args[arg].equals("-warc-header-local-file")) {
            arg++;
            File f = new File(args[arg]);
            FileInputStream fis = new FileInputStream(f);
            warcHeaderString = IOUtils.toString(fis, "UTF-8");
            arg++;
        } else if (args[arg].equals("-hmacname")) {
            arg++;
            String hmacName = args[arg];
            job.set("hmacName", hmacName);
            arg++;
        } else if (args[arg].equals("-hmacsignature")) {
            arg++;
            String hmacSignature = args[arg];
            job.set("hmacSignature", hmacSignature);
            arg++;
        } else if (args[arg].equals("-timeout")) {
            arg++;
            int taskTimeout = Integer.parseInt(args[arg]);
            job.setInt("mapred.task.timeout", taskTimeout);
            arg++;
        } else if (args[arg].equals("-failpct")) {
            arg++;
            int failPct = Integer.parseInt(args[arg]);
            job.setInt("mapred.max.map.failures.percent", failPct);
            arg++;
        } else {
            break;
        }
    }

    job.set("warcHeaderString", warcHeaderString);

    if (args.length - 2 != arg) {
        printUsage();
        return 1;
    }

    Path inputPath = new Path(args[arg]);
    arg++;

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    Path outputPath = new Path(outputDir);

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(ArchiveFileExtractorMapper.class);
    job.setJarByClass(ArchiveFileExtractor.class);

    TextInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.hadoop.jobs.CDXGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("CDX Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating CDXs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(CDXGeneratorMapper.class);
    job.setJarByClass(CDXGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to CDXGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.hadoop.jobs.WATGenerator.java

License: Apache License

/**
* Run the job.
*/
public int run(String[] args) throws Exception {
    if (args.length < 2) {
        usage();
        return 1;
    }

    // Create a job configuration
    JobConf job = new JobConf(getConf());

    // Job name uses output dir to help identify it to the operator.
    job.setJobName("WAT Generator " + args[0]);

    // The inputs are a list of filenames, use the
    // FilenameInputFormat to pass them to the mappers.
    job.setInputFormat(FilenameInputFormat.class);

    // This is a map-only job, no reducers.
    job.setNumReduceTasks(0);

    // set timeout to a high value - 20 hours
    job.setInt("mapred.task.timeout", 72000000);

    // keep job running despite some failures in generating WATs
    job.setBoolean("strictMode", false);

    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(WATGeneratorMapper.class);
    job.setJarByClass(WATGenerator.class);

    int arg = 0;
    if (args[arg].equals("-strictMode")) {
        job.setBoolean("strictMode", true);
        arg++;
    }

    String outputDir = args[arg];
    arg++;

    job.set("outputDir", outputDir);
    FileOutputFormat.setOutputPath(job, new Path(outputDir));

    boolean atLeastOneInput = false;
    for (int i = arg; i < args.length; i++) {
        FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
        for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
            Path inputPath = status.getPath();
            atLeastOneInput = true;
            LOG.info("Add input path: " + inputPath);
            FileInputFormat.addInputPath(job, inputPath);
        }
    }
    if (!atLeastOneInput) {
        LOG.info("No input files to WATGenerator.");
        return 0;
    }

    // Run the job!
    RunningJob rj = JobClient.runJob(job);
    if (!rj.isSuccessful()) {
        LOG.error("FAILED: " + rj.getID());
        return 2;
    }
    return 0;
}

From source file: org.archive.nutchwax.PageRankDb.java

License: Apache License

private static JobConf createJob(Configuration config, Path pageRankDb, boolean normalize, boolean filter) {
    Path newPageRankDb = new Path("pagerankdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("pagerankdb " + pageRankDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(PageRankDb.class);
    job.setCombinerClass(PageRankDbMerger.class);
    // if we don't run the mergeJob, perform normalization/filtering now
    if (normalize || filter) {
        try {
            FileSystem fs = FileSystem.get(config);
            if (!fs.exists(pageRankDb)) {
                job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
                job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
            }
        } catch (Exception e) {
            LOG.warn("PageRankDb createJob: " + e);
        }
    }
    job.setReducerClass(PageRankDbMerger.class);

    FileOutputFormat.setOutputPath(job, newPageRankDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", false);
    job.setOutputKeyClass(Text.class);

    // DIFF: Use IntWritable instead of Inlinks as the output value type.
    job.setOutputValueClass(IntWritable.class);

    return job;
}