List of usage examples for org.apache.hadoop.mapred JobConf setMapRunnerClass
public void setMapRunnerClass(Class<? extends MapRunnable> theClass)
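The class passed to setMapRunnerClass must implement org.apache.hadoop.mapred.MapRunnable (the old mapred API), which takes over the record-reading loop itself instead of having the framework call a Mapper once per record. Below is a minimal sketch of such a class; the class name MyMapRunner and the key/value types are hypothetical, chosen only for illustration, and are not taken from the examples that follow.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical MapRunnable: it pulls records from the RecordReader itself,
// which allows custom threading, batching, or error handling around the loop.
public class MyMapRunner implements MapRunnable<LongWritable, Text, Text, Text> {

    public void configure(JobConf job) {
        // read any job-specific settings here
    }

    public void run(RecordReader<LongWritable, Text> input, OutputCollector<Text, Text> output,
            Reporter reporter) throws IOException {
        LongWritable key = input.createKey();
        Text value = input.createValue();
        while (input.next(key, value)) {
            // per-record processing goes here
            output.collect(new Text(key.toString()), value);
            reporter.progress();
        }
    }
}

Such a class would then be registered on the job with job.setMapRunnerClass(MyMapRunner.class), alongside the usual input/output format and key/value class settings seen in the examples below.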
From source file:org.apache.nutch.fetcher.OldFetcher.java
License:Apache License
public void fetch(Path segment, int threads) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("OldFetcher: starting at " + sdf.format(start));
        LOG.info("OldFetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);
    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(OldFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("OldFetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.nutch.selenium.fetcher.SeleniumFetcher.java
License:Apache License
public void fetch(Path segment, int threads, String zippedDriverPath) throws IOException, URISyntaxException {
    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("fetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);
    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    // push the zipped_webdriver binaries onto the DistributedCache
    DistributedCache.addCacheArchive(new URI(zippedDriverPath), job);
    job.set("webdriver.binaries.path", zippedDriverPath);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(SeleniumFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:org.apache.pig.backend.hadoop.executionengine.mapreduceExec.MapReduceLauncher.java
License:Apache License
/**
 * Submit a Pig job to hadoop.
 *
 * @param mapFuncs
 *            a list of map functions to apply to the inputs. The cardinality of the list should
 *            be the same as input's cardinality.
 * @param groupFuncs
 *            a list of grouping functions to apply to the inputs. The cardinality of the list
 *            should be the same as input's cardinality.
 * @param reduceFunc
 *            the reduce function.
 * @param mapTasks
 *            the number of map tasks to use.
 * @param reduceTasks
 *            the number of reduce tasks to use.
 * @param input
 *            a list of inputs
 * @param output
 *            the path of the output.
 * @return an indicator of success or failure.
 * @throws IOException
 */
public boolean launchPig(POMapreduce pom) throws IOException {
    JobConf conf = new JobConf(config);
    setJobProperties(conf, pom);
    Properties properties = pom.pigContext.getProperties();
    ConfigurationValidator.validatePigProperties(properties);
    String jobName = properties.getProperty(PigContext.JOB_NAME);
    conf.setJobName(jobName);
    boolean success = false;
    List<String> funcs = new ArrayList<String>();

    if (pom.toMap != null) {
        for (EvalSpec es : pom.toMap)
            funcs.addAll(es.getFuncs());
    }
    if (pom.groupFuncs != null) {
        for (EvalSpec es : pom.groupFuncs)
            funcs.addAll(es.getFuncs());
    }
    if (pom.toReduce != null) {
        funcs.addAll(pom.toReduce.getFuncs());
    }

    // create jobs.jar locally and pass it to hadoop
    File submitJarFile = File.createTempFile("Job", ".jar");

    try {
        FileOutputStream fos = new FileOutputStream(submitJarFile);
        JarManager.createJar(fos, funcs, null, pom.pigContext);
        log.debug("Job jar size = " + submitJarFile.length());
        conf.setJar(submitJarFile.getPath());
        String user = System.getProperty("user.name");
        conf.setUser(user != null ? user : "Pigster");

        conf.set("pig.spill.size.threshold", properties.getProperty("pig.spill.size.threshold"));
        conf.set("pig.spill.gc.activation.size", properties.getProperty("pig.spill.gc.activation.size"));

        if (pom.reduceParallelism != -1) {
            conf.setNumReduceTasks(pom.reduceParallelism);
        }
        if (pom.toMap != null) {
            conf.set("pig.mapFuncs", ObjectSerializer.serialize(pom.toMap));
        }
        if (pom.toCombine != null) {
            conf.set("pig.combineFunc", ObjectSerializer.serialize(pom.toCombine));
            // this is to make sure that combiner is only called once
            // since we can't handle no combine or multiple combines
            conf.setCombineOnceOnly(true);
        }
        if (pom.groupFuncs != null) {
            conf.set("pig.groupFuncs", ObjectSerializer.serialize(pom.groupFuncs));
        }
        if (pom.toReduce != null) {
            conf.set("pig.reduceFunc", ObjectSerializer.serialize(pom.toReduce));
        }
        if (pom.toSplit != null) {
            conf.set("pig.splitSpec", ObjectSerializer.serialize(pom.toSplit));
        }
        if (pom.pigContext != null) {
            conf.set("pig.pigContext", ObjectSerializer.serialize(pom.pigContext));
        }

        conf.setMapRunnerClass(PigMapReduce.class);
        if (pom.toCombine != null) {
            conf.setCombinerClass(PigCombine.class);
            //conf.setCombinerClass(PigMapReduce.class);
        }
        if (pom.quantilesFile != null) {
            conf.set("pig.quantilesFile", pom.quantilesFile);
        } else {
            // this is not a sort job - can use byte comparison to speed up processing
            conf.setOutputKeyComparatorClass(PigWritableComparator.class);
        }
        if (pom.partitionFunction != null) {
            conf.setPartitionerClass(SortPartitioner.class);
        }
        conf.setReducerClass(PigMapReduce.class);
        conf.setInputFormat(PigInputFormat.class);
        conf.setOutputFormat(PigOutputFormat.class);
        // not used starting with 0.15 conf.setInputKeyClass(Text.class);
        // not used starting with 0.15 conf.setInputValueClass(Tuple.class);
        conf.setOutputKeyClass(Tuple.class);
        if (pom.userComparator != null) {
            conf.setOutputKeyComparatorClass(pom.userComparator);
        }
        conf.setOutputValueClass(IndexedTuple.class);
        conf.set("pig.inputs", ObjectSerializer.serialize(pom.inputFileSpecs));

        conf.setOutputPath(new Path(pom.outputFileSpec.getFileName()));
        conf.set("pig.storeFunc", ObjectSerializer.serialize(pom.outputFileSpec.getFuncSpec()));

        // Setup the DistributedCache for this job
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.ship.files", true);
        setupDistributedCache(pom.pigContext, conf, pom.properties, "pig.streaming.cache.files", false);

        // Setup the logs directory for this job
        String jobOutputFileName = pom.pigContext.getJobOutputFile();
        if (jobOutputFileName != null && jobOutputFileName.length() > 0) {
            Path jobOutputFile = new Path(pom.pigContext.getJobOutputFile());
            conf.set("pig.output.dir", jobOutputFile.getParent().toString());
            conf.set("pig.streaming.log.dir", new Path(jobOutputFile, LOG_DIR).toString());
        }

        //
        // Now, actually submit the job (using the submit name)
        //
        JobClient jobClient = execEngine.getJobClient();
        RunningJob status = jobClient.submitJob(conf);
        log.debug("submitted job: " + status.getJobID());

        long sleepTime = 1000;
        double lastQueryProgress = -1.0;
        int lastJobsQueued = -1;
        double lastMapProgress = -1.0;
        double lastReduceProgress = -1.0;

        while (true) {
            try {
                Thread.sleep(sleepTime);
            } catch (Exception e) {
            }

            if (status.isComplete()) {
                success = status.isSuccessful();
                if (log.isDebugEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append("Job finished ");
                    sb.append((success ? "" : "un"));
                    sb.append("successfully");
                    log.debug(sb.toString());
                }
                if (success) {
                    mrJobNumber++;
                }
                double queryProgress = ((double) mrJobNumber) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
                break;
            } else // still running
            {
                double mapProgress = status.mapProgress();
                double reduceProgress = status.reduceProgress();
                if (lastMapProgress != mapProgress || lastReduceProgress != reduceProgress) {
                    if (log.isDebugEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Hadoop job progress: Map=");
                        sbProgress.append((int) (mapProgress * 100));
                        sbProgress.append("% Reduce=");
                        sbProgress.append((int) (reduceProgress * 100));
                        sbProgress.append("%");
                        log.debug(sbProgress.toString());
                    }
                    lastMapProgress = mapProgress;
                    lastReduceProgress = reduceProgress;
                }

                double numJobsCompleted = mrJobNumber;
                double thisJobProgress = (mapProgress + reduceProgress) / 2.0;
                double queryProgress = (numJobsCompleted + thisJobProgress) / ((double) numMRJobs);
                if (queryProgress > lastQueryProgress) {
                    if (log.isInfoEnabled()) {
                        StringBuilder sbProgress = new StringBuilder();
                        sbProgress.append("Pig progress = ");
                        sbProgress.append(((int) (queryProgress * 100)));
                        sbProgress.append("%");
                        log.info(sbProgress.toString());
                    }
                    lastQueryProgress = queryProgress;
                }
            }
        }

        // bug 1030028: if the input file is empty; hadoop doesn't create the output file!
        Path outputFile = conf.getOutputPath();
        String outputName = outputFile.getName();
        int colon = outputName.indexOf(':');
        if (colon != -1) {
            outputFile = new Path(outputFile.getParent(), outputName.substring(0, colon));
        }

        try {
            ElementDescriptor descriptor = ((HDataStorage) (pom.pigContext.getDfs()))
                    .asElement(outputFile.toString());
            if (success && !descriptor.exists()) {
                // create an empty output file
                PigFile f = new PigFile(outputFile.toString(), false);
                f.store(BagFactory.getInstance().newDefaultBag(), new PigStorage(), pom.pigContext);
            }
        } catch (DataStorageException e) {
            throw WrappedIOException.wrap("Failed to obtain descriptor for " + outputFile.toString(), e);
        }

        if (!success) {
            // go find the error messages
            getErrorMessages(jobClient.getMapTaskReports(status.getJobID()), "map");
            getErrorMessages(jobClient.getReduceTaskReports(status.getJobID()), "reduce");
        } else {
            long timeSpent = 0;

            // NOTE: this call is crashing due to a bug in Hadoop; the bug is known and the patch has not been applied yet.
            TaskReport[] mapReports = jobClient.getMapTaskReports(status.getJobID());
            TaskReport[] reduceReports = jobClient.getReduceTaskReports(status.getJobID());
            for (TaskReport r : mapReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            for (TaskReport r : reduceReports) {
                timeSpent += (r.getFinishTime() - r.getStartTime());
            }
            totalHadoopTimeSpent += timeSpent;
        }
    } catch (Exception e) {
        // Do we need different handling for different exceptions
        e.printStackTrace();
        throw WrappedIOException.wrap(e);
    } finally {
        submitJarFile.delete();
    }

    return success;
}
From source file:org.archive.access.nutch.jobs.ImportArcs.java
License:LGPL
public void importArcs(final Path arcUrlsDir, final Path segment, final String collection) throws IOException {
    LOG.info("ImportArcs segment: " + segment + ", src: " + arcUrlsDir);

    final JobConf job = new JobConf(getConf(), this.getClass());
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setInputPath(arcUrlsDir);

    //job.setMapRunnerClass(job.getClass("wax.import.maprunner", ARCMapRunner.class));
    //job.setMapperClass(job.getClass("wax.import.mapper", this.getClass()));
    job.setMapRunnerClass(ARCMapRunner.class); // compatible with hadoop 0.14 TODO MC
    job.setMapperClass(this.getClass());

    job.setInputFormat(TextInputFormat.class);
    job.setOutputPath(segment);
    job.setOutputFormat(WaxFetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    // Pass the collection name out to the tasks IF non-null.
    if ((collection != null) && (collection.length() > 0)) {
        job.set(ImportArcs.WAX_SUFFIX + ImportArcs.ARCCOLLECTION_KEY, collection);
    }
    job.setJobName("import " + arcUrlsDir + " " + segment);

    JobClient.runJob(job);
    LOG.info("ImportArcs: done");
}
From source file:org.commoncrawl.hadoop.io.S3GetMetdataJob.java
License:Open Source License
public static void main(String[] args) {
    String accessKey = args[0];
    String secretKey = args[1];

    String paths[] = {
            // "2008/06",
            // "2008/07",
            // "2008/08",
            // "2008/09",
            // "2008/10",
            // "2008/11",
            "2009" };

    for (int pathIndex = 0; pathIndex < paths.length; ++pathIndex) {

        LOG.info("Processing Path:" + paths[pathIndex]);

        JobConf job = new JobConf(S3GetMetdataJob.class);

        Path tempDir = new Path(
                job.get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        LOG.info("Output for Path:" + paths[pathIndex] + " is:" + tempDir);
        System.out.println("Output Path is:" + tempDir);

        job.setJobName("S3 To CrawlURLMetadata Job for Path:" + paths[pathIndex]);

        // setup s3 properties
        JetS3tARCSource.setMaxRetries(job, 1);
        // set up S3 credentials ...
        JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
        JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);
        ARCSplitCalculator.setFilesPerSplit(job, 25);
        // set up arc reader properties
        ArcFileReader.setIOTimeoutValue(30000);
        // set input prefixes ...
        JetS3tARCSource.setInputPrefixes(job, paths[pathIndex]);
        // and S3 bucket name ...
        JetS3tARCSource.setBucketName(job, "commoncrawl");
        // and setup arc source for ArcInputFormat
        ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
        // and set up input format ...
        job.setInputFormat(ARCInputFormat.class);
        // set mapper ...
        job.setMapRunnerClass(S3GetMetdataJob.class);
        // setup reducer (identity in this case ... )
        job.setReducerClass(IdentityReducer.class);
        // standard output format ...
        job.setOutputFormat(SequenceFileOutputFormat.class);
        // set output path
        job.setOutputPath(tempDir);
        // map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(CrawlURLMetadata.class);
        // reduce output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlURLMetadata.class);
        // double the number of reducers ...
        // job.setNumReduceTasks(job.getNumReduceTasks() * 2);

        // run the job ...
        try {
            LOG.info("Starting Job:" + job.getJobName());
            JobClient.runJob(job);
            LOG.info("Finished Job:" + job.getJobName());

            Path finalPath = new Path("jobout/" + paths[pathIndex] + "/result");
            LOG.info("Copying Job Output to:" + finalPath);
            FileSystem fs = FileSystem.get(job);

            try {
                fs.mkdirs(finalPath.getParent());
                fs.rename(tempDir, finalPath);
                LOG.info("Copied Job Output to:" + finalPath);
            } finally {
                // fs.close();
            }
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            e.printStackTrace();
        }
    }
}
From source file:org.commoncrawl.hadoop.template.SampleHadoopJob.java
License:Open Source License
/**
 * main routine
 *
 * @param args
 */
public static void main(String[] args) {

    // amazon access key - passed on command line
    String accessKey = args[0];
    // amazon secret key - passed on command line
    String secretKey = args[1];
    // regular expression to match against - passed on command line
    String regEx = args[2];
    // group number to extract
    int groupNumber = Integer.parseInt(args[3]);

    /** arc file names start with year then month **/
    // we want to process all files uploaded in 2009
    // so, we will use the prefix string "2009",
    // but you could, for example, pass in a more restrictive
    // pattern such as "2008/06".
    String inputPrefix = "2009";

    LOG.info("Processing Path:" + inputPrefix);

    // allocate job config
    JobConf job = new JobConf(SampleHadoopJob.class);
    // set job name
    job.setJobName("Sample RegEx Job against path:" + inputPrefix);
    // set regular expression attributes
    job.set("mapred.mapper.regex", regEx);
    job.setInt("mapred.mapper.regex.group", groupNumber);

    // create temp file path
    Path tempDir = new Path(job.get("mapred.temp.dir", ".") + "/temp-" + System.currentTimeMillis());
    LOG.info("Output for job " + job.getJobName() + " is:" + tempDir);

    // we are going to be using the JetS3tARCSource as an input source to
    // the ArcInputFormat. This input source uses the multi-threaded jets3t
    // library to request data from S3.

    /** setup s3 properties **/

    // set the number of retries per ARC file.
    // we are setting this number to one, so if an IOException
    // occurs when processing an ARCFile, we are going to silently skip it
    // and continue processing the next ARC file. You should set this to be
    // a number LESS than mapred.max.tracker.failures (as defined in your
    // job config or hadoop-site.xml). Otherwise, your entire job could
    // fail if it encounters a bad ARC file in the bucket, or if the S3 service
    // exhibits a failure condition specific to a single key or set of keys.
    JetS3tARCSource.setMaxRetries(job, 1);

    // set up S3 credentials ...
    JetS3tARCSource.setAWSAccessKeyID(job, accessKey);
    JetS3tARCSource.setAWSSecretAccessKey(job, secretKey);

    // set the number of files per split
    // set this number higher if the bucket contains lots of files, to reduce
    // the burden on the map-reduce system from tracking too many file splits.
    ARCSplitCalculator.setFilesPerSplit(job, 25);

    /** set up arc reader properties **/

    // again, set the timeout to something reasonable, so that your entire job
    // will not hang if a single GET request fails to complete in a reasonable
    // amount of time
    ArcFileReader.setIOTimeoutValue(30000);
    // set input prefixes ...
    JetS3tARCSource.setInputPrefixes(job, inputPrefix);
    // and S3 bucket name ...
    JetS3tARCSource.setBucketName(job, "commoncrawl");
    // and setup arc source for ArcInputFormat
    ARCInputFormat.setARCSourceClass(job, JetS3tARCSource.class);
    // now inform the job that it needs to use the ARCInputFormat
    job.setInputFormat(ARCInputFormat.class);

    // set up our map runner class
    // we use a map runner instead of a mapper here to give us an extra level of
    // control over how we handle errors. When running a large job against
    // the crawl corpus which may contain hundreds of thousands of ARC files, it
    // is extremely important to reduce the risks of abnormal job termination.
    job.setMapRunnerClass(SampleHadoopJob.class);

    // setup reducer (identity in this case ... )
    job.setReducerClass(IdentityReducer.class);
    // standard output format ...
    job.setOutputFormat(SequenceFileOutputFormat.class);
    // set output path
    job.setOutputPath(tempDir);
    // map output types
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);

    // run the job ...
    try {
        LOG.info("Starting Job:" + job.getJobName());
        JobClient.runJob(job);
        LOG.info("Finished Job:" + job.getJobName());
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
    }
}
From source file:org.pentaho.hadoop.mapreduce.test.MapperAndReducerTest.java
License:Open Source License
public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {
    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();
    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass((Class<? extends Reducer>) GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}
From source file:org.pentaho.hadoop.mapreduce.test.PentahoMapReduceIntegrationTest.java
License:Apache License
public static JobConf createJobConf(String mapperTransformationFile, String combinerTransformationFile,
        String reducerTransformationFile, String hostname, String hdfsPort, String trackerPort)
        throws IOException, KettleException {
    JobConf conf = new JobConf();
    conf.setJobName("wordcount");

    KettleEnvironment.init();

    // Register Map/Reduce Input and Map/Reduce Output plugin steps
    PluginMainClassType mainClassTypesAnnotation = StepPluginType.class
            .getAnnotation(PluginMainClassType.class);

    Map<Class<?>, String> inputClassMap = new HashMap<Class<?>, String>();
    inputClassMap.put(mainClassTypesAnnotation.value(), HadoopEnterMeta.class.getName());
    PluginInterface inputStepPlugin = new Plugin(new String[] { "HadoopEnterPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Input",
            "Enter a Hadoop Mapper or Reducer transformation", "MRI.png", false, false, inputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, inputStepPlugin);

    Map<Class<?>, String> outputClassMap = new HashMap<Class<?>, String>();
    outputClassMap.put(mainClassTypesAnnotation.value(), HadoopExitMeta.class.getName());
    PluginInterface outputStepPlugin = new Plugin(new String[] { "HadoopExitPlugin" }, StepPluginType.class,
            mainClassTypesAnnotation.value(), "Hadoop", "MapReduce Output",
            "Exit a Hadoop Mapper or Reducer transformation", "MRO.png", false, false, outputClassMap,
            new ArrayList<String>(), null, null);
    PluginRegistry.getInstance().registerPlugin(StepPluginType.class, outputStepPlugin);

    TransExecutionConfiguration transExecConfig = new TransExecutionConfiguration();
    TransMeta transMeta = null;
    TransConfiguration transConfig = null;

    if (mapperTransformationFile != null) {
        conf.setMapRunnerClass(PentahoMapRunnable.class);
        transMeta = new TransMeta(mapperTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-map-xml", transConfig.getXML());
        conf.set("transformation-map-input-stepname", "Injector");
        conf.set("transformation-map-output-stepname", "Output");
    }

    if (combinerTransformationFile != null) {
        conf.setCombinerClass(GenericTransCombiner.class);
        transMeta = new TransMeta(combinerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-combiner-xml", transConfig.getXML());
        conf.set("transformation-combiner-input-stepname", "Injector");
        conf.set("transformation-combiner-output-stepname", "Output");
    }

    if (reducerTransformationFile != null) {
        conf.setReducerClass(GenericTransReduce.class);
        transMeta = new TransMeta(reducerTransformationFile);
        transConfig = new TransConfiguration(transMeta, transExecConfig);
        conf.set("transformation-reduce-xml", transConfig.getXML());
        conf.set("transformation-reduce-input-stepname", "Injector");
        conf.set("transformation-reduce-output-stepname", "Output");
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    File jar = new File("./dist/pentaho-big-data-plugin-TRUNK-SNAPSHOT.jar");

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    FileInputFormat.setInputPaths(conf, new Path("/"));
    FileOutputFormat.setOutputPath(conf, new Path("/"));

    conf.set("fs.default.name", "hdfs://" + hostname + ":" + hdfsPort);
    conf.set("mapred.job.tracker", hostname + ":" + trackerPort);

    conf.setJar(jar.toURI().toURL().toExternalForm());
    conf.setWorkingDirectory(new Path("/tmp/wordcount"));

    return conf;
}
From source file:sa.edu.kaust.fwindex.BuildIntDocVectorsForwardIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String inPath = args[0];
    String outPath = args[1];

    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = 10;

    sLogger.info("Tool: BuildIntDocVectorsIndex");

    String intDocVectorsPath = inPath;
    String forwardIndexPath = outPath;

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorsForwardIndex already exists: skipping!");
        return 0;
    }

    conf.set("ForwardIndexPath", forwardIndexPath);
    conf.setJobName("BuildIntDocVectorsForwardIndex");

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(TermDF.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file:sa.edu.kaust.twitter.index.BuildPostingsForwardIndex.java
License:Apache License
/**
 * Runs this tool.
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    JobConf conf = new JobConf(BuildPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = 10;

    sLogger.info("Tool: PostingsForwardIndex");

    String postingsPath = args[0];
    String forwardIndexPath = args[1];

    if (!fs.exists(new Path(postingsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    // delete the output directory if it exists already
    //FileSystem.get(conf).delete(new Path(forwardIndexPath), true);
    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("PostingsForwardIndex already exists: skipping!");
        return 0;
    }

    conf.set("ForwardIndexPath", forwardIndexPath);
    conf.setJobName("BuildPostingsForwardIndex");

    Path inputPath = new Path(postingsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}