List of usage examples for org.apache.hadoop.mapred.JobConf.setBoolean

public void setBoolean(String name, boolean value)

Sets the value of the name property to a boolean.
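Before the source-file examples, here is a minimal, self-contained sketch of the call in isolation. It is illustrative only: the driver class, paths, and property names (one framework property taken from the examples below, one application-defined flag) are assumptions for demonstration, not part of any listed project.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

// Hypothetical driver showing JobConf.setBoolean; property names are examples only.
public class SetBooleanExample {
  public static void main(String[] args) throws Exception {
    JobConf job = new JobConf(SetBooleanExample.class);
    job.setJobName("setBoolean example");

    // Store boolean values under property names; they can be read back later
    // (for example in a Mapper's configure()) with job.getBoolean(name, defaultValue).
    job.setBoolean("mapred.map.tasks.speculative.execution", false); // framework property
    job.setBoolean("example.feature.enabled", true);                 // application-defined flag

    job.setInputFormat(TextInputFormat.class);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);
  }
}

The examples that follow show the same call in real projects, both for framework properties (speculative execution, output compression, recursive input) and for application-defined flags such as "strictMode" and "soft".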
From source file:org.apache.tez.mapreduce.processor.MRTask.java
License:Apache License
public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
  jobConf.set(JobContext.TASK_ID, taskAttemptId.getTaskID().toString());
  jobConf.set(JobContext.TASK_ATTEMPT_ID, taskAttemptId.toString());
  jobConf.setInt(JobContext.TASK_PARTITION, taskAttemptId.getTaskID().getId());
  jobConf.set(JobContext.ID, taskAttemptId.getJobID().toString());
  jobConf.setBoolean(MRJobConfig.TASK_ISMAP, isMap);

  Path outputPath = FileOutputFormat.getOutputPath(jobConf);
  if (outputPath != null) {
    if ((committer instanceof FileOutputCommitter)) {
      FileOutputFormat.setWorkOutputPath(jobConf,
          ((FileOutputCommitter) committer).getTaskAttemptPath(taskAttemptContext));
    } else {
      FileOutputFormat.setWorkOutputPath(jobConf, outputPath);
    }
  }
}
From source file:org.apache.tez.mapreduce.processor.reduce.ReduceProcessor.java
License:Apache License
@Override
public void localizeConfiguration(JobConf jobConf) throws IOException, InterruptedException {
  super.localizeConfiguration(jobConf);
  jobConf.setBoolean(JobContext.TASK_ISMAP, false);
}
From source file:org.apache.tez.mapreduce.processor.reduce.TestReduceProcessor.java
License:Apache License
@Test(timeout = 5000)
public void testReduceProcessor() throws Exception {
  final String dagName = "mrdag0";
  String mapVertexName = MultiStageMRConfigUtil.getInitialMapVertexName();
  String reduceVertexName = MultiStageMRConfigUtil.getFinalReduceVertexName();
  JobConf jobConf = new JobConf(defaultConf);
  setUpJobConf(jobConf);
  MRHelpers.translateMRConfToTez(jobConf);
  jobConf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
  jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
      new Path(workDir, "localized-resources").toUri().toString());
  jobConf.setBoolean(MRJobConfig.MR_TEZ_SPLITS_VIA_EVENTS, false);

  Path mapInput = new Path(workDir, "map0");
  MapUtils.generateInputSplit(localFs, workDir, jobConf, mapInput);

  InputSpec mapInputSpec = new InputSpec("NullSrcVertex",
      InputDescriptor.create(MRInputLegacy.class.getName())
          .setUserPayload(UserPayload.create(ByteBuffer.wrap(MRRuntimeProtos.MRInputUserPayloadProto
              .newBuilder().setConfigurationBytes(TezUtils.createByteStringFromConf(jobConf))
              .build().toByteArray()))),
      1);
  OutputSpec mapOutputSpec = new OutputSpec("NullDestVertex",
      OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName())
          .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
      1);

  // Run a map
  TestUmbilical testUmbilical = new TestUmbilical();
  LogicalIOProcessorRuntimeTask mapTask = MapUtils.createLogicalTask(localFs, workDir, jobConf, 0, mapInput,
      testUmbilical, dagName, mapVertexName, Collections.singletonList(mapInputSpec),
      Collections.singletonList(mapOutputSpec));

  mapTask.initialize();
  mapTask.run();
  mapTask.close();

  // One VME, One DME
  Assert.assertEquals(2, testUmbilical.getEvents().size());
  Assert.assertEquals(EventType.VERTEX_MANAGER_EVENT, testUmbilical.getEvents().get(0).getEventType());
  Assert.assertEquals(EventType.COMPOSITE_DATA_MOVEMENT_EVENT, testUmbilical.getEvents().get(1).getEventType());

  CompositeDataMovementEvent cdmEvent = (CompositeDataMovementEvent) testUmbilical.getEvents().get(1)
      .getEvent();
  Assert.assertEquals(1, cdmEvent.getCount());
  DataMovementEvent dme = cdmEvent.getEvents().iterator().next();
  dme.setTargetIndex(0);

  LOG.info("Starting reduce...");
  JobTokenIdentifier identifier = new JobTokenIdentifier(new Text(dagName));
  JobTokenSecretManager jobTokenSecretManager = new JobTokenSecretManager();
  Token<JobTokenIdentifier> shuffleToken = new Token<JobTokenIdentifier>(identifier, jobTokenSecretManager);
  shuffleToken.setService(identifier.getJobId());

  jobConf.setOutputFormat(SequenceFileOutputFormat.class);
  jobConf.set(MRFrameworkConfigs.TASK_LOCAL_RESOURCE_DIR,
      new Path(workDir, "localized-resources").toUri().toString());
  jobConf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_OPTIMIZE_LOCAL_FETCH, true);
  FileOutputFormat.setOutputPath(jobConf, new Path(workDir, "output"));
  ProcessorDescriptor reduceProcessorDesc = ProcessorDescriptor.create(ReduceProcessor.class.getName())
      .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf));

  InputSpec reduceInputSpec = new InputSpec(mapVertexName,
      InputDescriptor.create(OrderedGroupedInputLegacy.class.getName())
          .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
      1);
  OutputSpec reduceOutputSpec = new OutputSpec("NullDestinationVertex",
      OutputDescriptor.create(MROutputLegacy.class.getName())
          .setUserPayload(TezUtils.createUserPayloadFromConf(jobConf)),
      1);

  // Now run a reduce
  TaskSpec taskSpec = new TaskSpec(TezTestUtils.getMockTaskAttemptId(0, 1, 0, 0), dagName, reduceVertexName,
      -1, reduceProcessorDesc, Collections.singletonList(reduceInputSpec),
      Collections.singletonList(reduceOutputSpec), null);

  Map<String, ByteBuffer> serviceConsumerMetadata = new HashMap<String, ByteBuffer>();
  serviceConsumerMetadata.put(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID,
      ShuffleUtils.convertJobTokenToBytes(shuffleToken));
  Map<String, String> serviceProviderEnvMap = new HashMap<String, String>();
  ByteBuffer shufflePortBb = ByteBuffer.allocate(4).putInt(0, 8000);
  AuxiliaryServiceHelper.setServiceDataIntoEnv(ShuffleUtils.SHUFFLE_HANDLER_SERVICE_ID, shufflePortBb,
      serviceProviderEnvMap);

  LogicalIOProcessorRuntimeTask task = new LogicalIOProcessorRuntimeTask(taskSpec, 0, jobConf,
      new String[] { workDir.toString() }, new TestUmbilical(), serviceConsumerMetadata,
      serviceProviderEnvMap, HashMultimap.<String, String>create(), null, "",
      new ExecutionContextImpl("localhost"), Runtime.getRuntime().maxMemory());

  List<Event> destEvents = new LinkedList<Event>();
  destEvents.add(dme);
  task.initialize();
  OrderedGroupedInputLegacy sortedOut = (OrderedGroupedInputLegacy) task.getInputs().values().iterator()
      .next();
  sortedOut.handleEvents(destEvents);

  task.run();
  task.close();

  // MRTask mrTask = (MRTask)t.getProcessor();
  // TODO NEWTEZ Verify the partitioner has not been created
  // Likely not applicable anymore.
  // Assert.assertNull(mrTask.getPartitioner());

  // Only a task commit happens, hence the data is still in the temporary directory.
  Path reduceOutputDir = new Path(new Path(workDir, "output"),
      "_temporary/0/" + IDConverter.toMRTaskIdForOutput(TezTestUtils.getMockTaskId(0, 1, 0)));
  Path reduceOutputFile = new Path(reduceOutputDir, "part-v001-o000-00000");

  SequenceFile.Reader reader = new SequenceFile.Reader(localFs, reduceOutputFile, jobConf);
  LongWritable key = new LongWritable();
  Text value = new Text();
  long prev = Long.MIN_VALUE;
  while (reader.next(key, value)) {
    if (prev != Long.MIN_VALUE) {
      Assert.assertTrue(prev < key.get());
      prev = key.get();
    }
  }
  reader.close();
}
From source file:org.apache.tez.mapreduce.TestMRRJobsDAGApi.java
License:Apache License
public State testMRRSleepJobDagSubmitCore(boolean dagViaRPC, boolean killDagWhileRunning,
    boolean closeSessionBeforeSubmit, TezClient reUseTezSession, boolean genSplitsInAM,
    Class<? extends InputInitializer> initializerClass, Map<String, LocalResource> additionalLocalResources)
    throws IOException, InterruptedException, TezException, ClassNotFoundException, YarnException {
  LOG.info("\n\n\nStarting testMRRSleepJobDagSubmit().");

  JobConf stage1Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage2Conf = new JobConf(mrrTezCluster.getConfig());
  JobConf stage3Conf = new JobConf(mrrTezCluster.getConfig());

  stage1Conf.setLong(MRRSleepJob.MAP_SLEEP_TIME, 1);
  stage1Conf.setInt(MRRSleepJob.MAP_SLEEP_COUNT, 1);
  stage1Conf.setInt(MRJobConfig.NUM_MAPS, 1);
  stage1Conf.set(MRJobConfig.MAP_CLASS_ATTR, SleepMapper.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage1Conf.set(MRJobConfig.INPUT_FORMAT_CLASS_ATTR, SleepInputFormat.class.getName());
  stage1Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

  stage2Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage2Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage2Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage2Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, ISleepReducer.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());
  stage2Conf.set(MRJobConfig.PARTITIONER_CLASS_ATTR, MRRSleepJobPartitioner.class.getName());

  stage3Conf.setLong(MRRSleepJob.REDUCE_SLEEP_TIME, 1);
  stage3Conf.setInt(MRRSleepJob.REDUCE_SLEEP_COUNT, 1);
  stage3Conf.setInt(MRJobConfig.NUM_REDUCES, 1);
  stage3Conf.set(MRJobConfig.REDUCE_CLASS_ATTR, SleepReducer.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_KEY_CLASS, IntWritable.class.getName());
  stage3Conf.set(MRJobConfig.MAP_OUTPUT_VALUE_CLASS, IntWritable.class.getName());

  MRHelpers.translateMRConfToTez(stage1Conf);
  MRHelpers.translateMRConfToTez(stage2Conf);
  MRHelpers.translateMRConfToTez(stage3Conf);
  MRHelpers.configureMRApiUsage(stage1Conf);
  MRHelpers.configureMRApiUsage(stage2Conf);
  MRHelpers.configureMRApiUsage(stage3Conf);

  Path remoteStagingDir = remoteFs
      .makeQualified(new Path("/tmp", String.valueOf(new Random().nextInt(100000))));
  TezClientUtils.ensureStagingDirExists(conf, remoteStagingDir);

  UserPayload stage1Payload = TezUtils.createUserPayloadFromConf(stage1Conf);
  UserPayload stage2Payload = TezUtils.createUserPayloadFromConf(stage2Conf);
  UserPayload stage3Payload = TezUtils.createUserPayloadFromConf(stage3Conf);

  DAG dag = DAG.create("testMRRSleepJobDagSubmit-" + random.nextInt(1000));

  Class<? extends InputInitializer> inputInitializerClazz = genSplitsInAM
      ? (initializerClass == null ? MRInputAMSplitGenerator.class : initializerClass)
      : null;
  LOG.info("Using initializer class: " + initializerClass);

  DataSourceDescriptor dsd;
  if (!genSplitsInAM) {
    dsd = MRInputHelpers.configureMRInputWithLegacySplitGeneration(stage1Conf, remoteStagingDir, true);
  } else {
    if (initializerClass == null) {
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class).build();
    } else {
      InputInitializerDescriptor iid = InputInitializerDescriptor.create(inputInitializerClazz.getName());
      dsd = MRInputLegacy.createConfigBuilder(stage1Conf, SleepInputFormat.class)
          .setCustomInitializerDescriptor(iid).build();
    }
  }

  Vertex stage1Vertex = Vertex.create("map",
      ProcessorDescriptor.create(MapProcessor.class.getName()).setUserPayload(stage1Payload),
      dsd.getNumberOfShards(), Resource.newInstance(256, 1));
  stage1Vertex.addDataSource("MRInput", dsd);
  Vertex stage2Vertex = Vertex.create("ireduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage2Payload), 1,
      Resource.newInstance(256, 1));
  Vertex stage3Vertex = Vertex.create("reduce",
      ProcessorDescriptor.create(ReduceProcessor.class.getName()).setUserPayload(stage3Payload), 1,
      Resource.newInstance(256, 1));
  stage3Conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_CONVERT_USER_PAYLOAD_TO_HISTORY_TEXT, true);
  DataSinkDescriptor dataSinkDescriptor = MROutputLegacy
      .createConfigBuilder(stage3Conf, NullOutputFormat.class).build();
  Assert.assertFalse(dataSinkDescriptor.getOutputDescriptor().getHistoryText().isEmpty());
  stage3Vertex.addDataSink("MROutput", dataSinkDescriptor);

  // TODO env, resources

  dag.addVertex(stage1Vertex);
  dag.addVertex(stage2Vertex);
  dag.addVertex(stage3Vertex);

  Edge edge1 = Edge.create(stage1Vertex, stage2Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage2Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage2Payload)));
  Edge edge2 = Edge.create(stage2Vertex, stage3Vertex,
      EdgeProperty.create(DataMovementType.SCATTER_GATHER, DataSourceType.PERSISTED,
          SchedulingType.SEQUENTIAL,
          OutputDescriptor.create(OrderedPartitionedKVOutput.class.getName()).setUserPayload(stage3Payload),
          InputDescriptor.create(OrderedGroupedInputLegacy.class.getName()).setUserPayload(stage3Payload)));

  dag.addEdge(edge1);
  dag.addEdge(edge2);

  TezConfiguration tezConf = new TezConfiguration(mrrTezCluster.getConfig());
  tezConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, remoteStagingDir.toString());

  DAGClient dagClient = null;
  boolean reuseSession = reUseTezSession != null;
  TezClient tezSession = null;
  if (!dagViaRPC) {
    Preconditions.checkArgument(reuseSession == false);
  }
  if (!reuseSession) {
    TezConfiguration tempTezconf = new TezConfiguration(tezConf);
    if (!dagViaRPC) {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, false);
    } else {
      tempTezconf.setBoolean(TezConfiguration.TEZ_AM_SESSION_MODE, true);
    }
    tezSession = TezClient.create("testsession", tempTezconf);
    tezSession.start();
  } else {
    tezSession = reUseTezSession;
  }
  if (!dagViaRPC) {
    // TODO Use utility method post TEZ-205 to figure out AM arguments etc.
    dagClient = tezSession.submitDAG(dag);
  }

  if (dagViaRPC && closeSessionBeforeSubmit) {
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(mrrTezCluster.getConfig());
    yarnClient.start();
    boolean sentKillSession = false;
    while (true) {
      Thread.sleep(500l);
      ApplicationReport appReport = yarnClient
          .getApplicationReport(tezSession.getAppMasterApplicationId());
      if (appReport == null) {
        continue;
      }
      YarnApplicationState appState = appReport.getYarnApplicationState();
      if (!sentKillSession) {
        if (appState == YarnApplicationState.RUNNING) {
          tezSession.stop();
          sentKillSession = true;
        }
      } else {
        if (appState == YarnApplicationState.FINISHED || appState == YarnApplicationState.KILLED
            || appState == YarnApplicationState.FAILED) {
          LOG.info("Application completed after sending session shutdown"
              + ", yarnApplicationState=" + appState
              + ", finalAppStatus=" + appReport.getFinalApplicationStatus());
          Assert.assertEquals(YarnApplicationState.FINISHED, appState);
          Assert.assertEquals(FinalApplicationStatus.SUCCEEDED, appReport.getFinalApplicationStatus());
          break;
        }
      }
    }
    yarnClient.stop();
    return null;
  }

  if (dagViaRPC) {
    LOG.info("Submitting dag to tez session with appId=" + tezSession.getAppMasterApplicationId()
        + " and Dag Name=" + dag.getName());
    if (additionalLocalResources != null) {
      tezSession.addAppMasterLocalFiles(additionalLocalResources);
    }
    dagClient = tezSession.submitDAG(dag);
    Assert.assertEquals(TezAppMasterStatus.RUNNING, tezSession.getAppMasterStatus());
  }
  DAGStatus dagStatus = dagClient.getDAGStatus(null);
  while (!dagStatus.isCompleted()) {
    LOG.info("Waiting for job to complete. Sleeping for 500ms."
        + " Current state: " + dagStatus.getState());
    Thread.sleep(500l);
    if (killDagWhileRunning && dagStatus.getState() == DAGStatus.State.RUNNING) {
      LOG.info("Killing running dag/session");
      if (dagViaRPC) {
        tezSession.stop();
      } else {
        dagClient.tryKillDAG();
      }
    }
    dagStatus = dagClient.getDAGStatus(null);
  }
  if (!reuseSession) {
    tezSession.stop();
  }
  return dagStatus.getState();
}
From source file:org.apache.trevni.avro.AvroTrevniInputFormat.java
License:Apache License
@Override
protected FileStatus[] listStatus(JobConf job) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  job.setBoolean("mapred.input.dir.recursive", true);
  for (FileStatus file : super.listStatus(job))
    if (file.getPath().getName().endsWith(AvroTrevniOutputFormat.EXT))
      result.add(file);
  return result.toArray(new FileStatus[0]);
}
From source file:org.archive.access.nutch.jobs.NutchwaxCrawlDb.java
License:Open Source License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
    boolean force) throws IOException {
  FileSystem fs = FileSystem.get(getConf());
  Path lock = new Path(crawlDb, LOCK_NAME);
  LockUtil.createLockFile(fs, lock, force);
  if (LOG.isInfoEnabled()) {
    LOG.info("NutchwaxCrawlDb update: starting");
    LOG.info("NutchwaxCrawlDb update: db: " + crawlDb);
    LOG.info("NutchwaxCrawlDb update: segment: " + Arrays.asList(segments));
    LOG.info("NutchwaxCrawlDb update: additions allowed: " + additionsAllowed);
    LOG.info("NutchwaxCrawlDb update: URL normalizing: " + normalize);
    LOG.info("NutchwaxCrawlDb update: URL filtering: " + filter);
  }

  JobConf job = CrawlDb.createJob(getConf(), crawlDb);

  // Now, change the map and reduce to run. Use ours instead.
  job.setMapperClass(NutchwaxCrawlDbFilter.class);
  // Use nutch native reducer. It passes the key via the scoring
  // plugins but as currently implemented, they don't expect the key to
  // be an URL.
  // job.setReducerClass(CrawlDbReducer.class);

  job.setJobName("nutchwaxcrawldb " + crawlDb + " " + Arrays.asList(segments));
  job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
  job.setBoolean(NutchwaxCrawlDbFilter.URL_FILTERING, filter);
  job.setBoolean(NutchwaxCrawlDbFilter.URL_NORMALIZING, normalize);

  for (int i = 0; i < segments.length; i++) {
    Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
    Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
    if (fs.exists(fetch) && fs.exists(parse)) {
      job.addInputPath(fetch);
      job.addInputPath(parse);
    } else {
      LOG.info("Segment " + segments[i] + " is missing " + CrawlDatum.FETCH_DIR_NAME + " or "
          + CrawlDatum.PARSE_DIR_NAME + " (skipping).");
    }
  }

  if (LOG.isInfoEnabled()) {
    LOG.info("NutchwaxCrawlDb update: Merging segment data " + Arrays.asList(segments) + " into db.");
  }

  try {
    JobClient.runJob(job);
  } catch (IOException e) {
    LockUtil.removeLockFile(fs, lock);
    if (fs.exists(job.getOutputPath())) {
      fs.delete(job.getOutputPath());
    }
    throw e;
  }

  NutchwaxCrawlDb.install(job, crawlDb);
  if (LOG.isInfoEnabled()) {
    LOG.info("NutchwaxCrawlDb update: done");
  }
}
From source file:org.archive.hadoop.jobs.ArchiveFileExtractor.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    printUsage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("Archive File Extractor");

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // turn off speculative execution
  job.setBoolean("mapred.map.tasks.speculative.execution", false);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // tolerate task exceptions
  job.setBoolean("soft", false);

  int arg = 0;
  int numMaps = 10;

  String DEFAULT_WARC_PATTERN = "software: %s Extractor\r\n"
      + "format: WARC File Format 1.0\r\n"
      + "conformsTo: http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf\r\n"
      + "publisher: Internet Archive\r\n"
      + "created: %s\r\n\r\n";
  String warcHeaderString = String.format(DEFAULT_WARC_PATTERN, IAUtils.COMMONS_VERSION,
      DateUtils.getLog17Date(System.currentTimeMillis()));

  while (arg < args.length - 1) {
    if (args[arg].equals("-soft")) {
      job.setBoolean("soft", true);
      arg++;
    } else if (args[arg].equals("-mappers")) {
      arg++;
      numMaps = Integer.parseInt(args[arg]);
      job.setNumMapTasks(numMaps);
      arg++;
    } else if (args[arg].equals("-timestamp14")) {
      arg++;
      String timestamp14 = DateUtils.get14DigitDate(DateUtils.parse14DigitDate(args[arg]));
      job.set("timestamp14", timestamp14);
      arg++;
    } else if (args[arg].equals("-warc-header-local-file")) {
      arg++;
      File f = new File(args[arg]);
      FileInputStream fis = new FileInputStream(f);
      warcHeaderString = IOUtils.toString(fis, "UTF-8");
      arg++;
    } else if (args[arg].equals("-hmacname")) {
      arg++;
      String hmacName = args[arg];
      job.set("hmacName", hmacName);
      arg++;
    } else if (args[arg].equals("-hmacsignature")) {
      arg++;
      String hmacSignature = args[arg];
      job.set("hmacSignature", hmacSignature);
      arg++;
    } else if (args[arg].equals("-timeout")) {
      arg++;
      int taskTimeout = Integer.parseInt(args[arg]);
      job.setInt("mapred.task.timeout", taskTimeout);
      arg++;
    } else if (args[arg].equals("-failpct")) {
      arg++;
      int failPct = Integer.parseInt(args[arg]);
      job.setInt("mapred.max.map.failures.percent", failPct);
      arg++;
    } else {
      break;
    }
  }
  job.set("warcHeaderString", warcHeaderString);

  if (args.length - 2 != arg) {
    printUsage();
    return 1;
  }

  Path inputPath = new Path(args[arg]);
  arg++;

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  Path outputPath = new Path(outputDir);

  job.setInputFormat(TextInputFormat.class);
  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(ArchiveFileExtractorMapper.class);
  job.setJarByClass(ArchiveFileExtractor.class);

  TextInputFormat.addInputPath(job, inputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.hadoop.jobs.CDXGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("CDX Generator " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // keep job running despite some failures in generating CDXs
  job.setBoolean("strictMode", false);

  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(CDXGeneratorMapper.class);
  job.setJarByClass(CDXGenerator.class);

  int arg = 0;
  if (args[arg].equals("-strictMode")) {
    job.setBoolean("strictMode", true);
    arg++;
  }

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));

  boolean atLeastOneInput = false;
  for (int i = arg; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      atLeastOneInput = true;
      LOG.info("Add input path: " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to CDXGenerator.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.hadoop.jobs.WATGenerator.java
License:Apache License
/**
 * Run the job.
 */
public int run(String[] args) throws Exception {
  if (args.length < 2) {
    usage();
    return 1;
  }

  // Create a job configuration
  JobConf job = new JobConf(getConf());

  // Job name uses output dir to help identify it to the operator.
  job.setJobName("WAT Generator " + args[0]);

  // The inputs are a list of filenames, use the
  // FilenameInputFormat to pass them to the mappers.
  job.setInputFormat(FilenameInputFormat.class);

  // This is a map-only job, no reducers.
  job.setNumReduceTasks(0);

  // set timeout to a high value - 20 hours
  job.setInt("mapred.task.timeout", 72000000);

  // keep job running despite some failures in generating WATs
  job.setBoolean("strictMode", false);

  job.setOutputFormat(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(Text.class);
  job.setMapperClass(WATGeneratorMapper.class);
  job.setJarByClass(WATGenerator.class);

  int arg = 0;
  if (args[arg].equals("-strictMode")) {
    job.setBoolean("strictMode", true);
    arg++;
  }

  String outputDir = args[arg];
  arg++;
  job.set("outputDir", outputDir);
  FileOutputFormat.setOutputPath(job, new Path(outputDir));

  boolean atLeastOneInput = false;
  for (int i = arg; i < args.length; i++) {
    FileSystem inputfs = FileSystem.get(new java.net.URI(args[i]), getConf());
    for (FileStatus status : inputfs.globStatus(new Path(args[i]))) {
      Path inputPath = status.getPath();
      atLeastOneInput = true;
      LOG.info("Add input path: " + inputPath);
      FileInputFormat.addInputPath(job, inputPath);
    }
  }
  if (!atLeastOneInput) {
    LOG.info("No input files to WATGenerator.");
    return 0;
  }

  // Run the job!
  RunningJob rj = JobClient.runJob(job);
  if (!rj.isSuccessful()) {
    LOG.error("FAILED: " + rj.getID());
    return 2;
  }
  return 0;
}
From source file:org.archive.nutchwax.PageRankDb.java
License:Apache License
private static JobConf createJob(Configuration config, Path pageRankDb, boolean normalize, boolean filter) {
  Path newPageRankDb = new Path("pagerankdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

  JobConf job = new NutchJob(config);
  job.setJobName("pagerankdb " + pageRankDb);

  job.setInputFormat(SequenceFileInputFormat.class);

  job.setMapperClass(PageRankDb.class);
  job.setCombinerClass(PageRankDbMerger.class);
  // if we don't run the mergeJob, perform normalization/filtering now
  if (normalize || filter) {
    try {
      FileSystem fs = FileSystem.get(config);
      if (!fs.exists(pageRankDb)) {
        job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
        job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
      }
    } catch (Exception e) {
      LOG.warn("PageRankDb createJob: " + e);
    }
  }
  job.setReducerClass(PageRankDbMerger.class);

  FileOutputFormat.setOutputPath(job, newPageRankDb);
  job.setOutputFormat(MapFileOutputFormat.class);
  job.setBoolean("mapred.output.compress", false);
  job.setOutputKeyClass(Text.class);
  // DIFF: Use IntWritable instead of Inlinks as the output value type.
  job.setOutputValueClass(IntWritable.class);
  return job;
}