Usage examples for org.apache.hadoop.mapreduce.Job#getConfiguration()
public Configuration getConfiguration()
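All of the examples below follow the same basic pattern: build a Job, then read or modify its underlying Configuration through getConfiguration(). As a minimal sketch of that pattern (not taken from any of the source files below; the com.example.* property key is made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "example job");

        // Read and write properties through the Job's own Configuration.
        Configuration jobConf = job.getConfiguration();
        jobConf.set("com.example.pattern", ".*ERROR.*"); // hypothetical application key
        jobConf.setInt("mapreduce.job.reduces", 4);

        System.out.println(jobConf.get("com.example.pattern"));
    }
}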
From source file:com.kylinolap.job.hadoop.AbstractHadoopJob.java
License:Apache License
public void addInputDirs(String input, Job job) throws IOException {
    for (String inp : StringSplitter.split(input, ",")) {
        inp = inp.trim();
        if (inp.endsWith("/*")) {
            inp = inp.substring(0, inp.length() - 2);
            FileSystem fs = FileSystem.get(job.getConfiguration());
            Path path = new Path(inp);
            FileStatus[] fileStatuses = fs.listStatus(path);
            boolean hasDir = false;
            for (FileStatus stat : fileStatuses) {
                if (stat.isDirectory()) {
                    hasDir = true;
                    addInputDirs(stat.getPath().toString(), job);
                }
            }
            if (fileStatuses.length > 0 && !hasDir) {
                addInputDirs(path.toString(), job);
            }
        } else {
            System.out.println("Add input " + inp);
            FileInputFormat.addInputPath(job, new Path(inp));
        }
    }
}
From source file:com.kylinolap.job.hadoop.cube.CuboidJob.java
License:Apache License
protected void setReduceTaskNum(Job job, KylinConfig config, String cubeName, int level)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    Configuration jobConf = job.getConfiguration();
    KylinConfig kylinConfig = KylinConfig.getInstanceFromEnv();

    CubeDesc cubeDesc = CubeManager.getInstance(config).getCube(cubeName).getDescriptor();

    double perReduceInputMB = kylinConfig.getDefaultHadoopJobReducerInputMB();
    double reduceCountRatio = kylinConfig.getDefaultHadoopJobReducerCountRatio();

    // total map input MB
    double totalMapInputMB = this.getTotalMapInputMB();

    // output / input ratio
    int preLevelCuboids, thisLevelCuboids;
    if (level == 0) { // base cuboid
        preLevelCuboids = thisLevelCuboids = 1;
    } else { // n-cuboid
        int[] allLevelCount = CuboidCLI.calculateAllLevelCount(cubeDesc);
        preLevelCuboids = allLevelCount[level - 1];
        thisLevelCuboids = allLevelCount[level];
    }

    // total reduce input MB
    double totalReduceInputMB = totalMapInputMB * thisLevelCuboids / preLevelCuboids;

    // number of reduce tasks
    int numReduceTasks = (int) Math.round(totalReduceInputMB / perReduceInputMB * reduceCountRatio);

    // adjust reducer number for cubes which have DISTINCT_COUNT measures, for better performance
    if (cubeDesc.hasHolisticCountDistinctMeasures()) {
        numReduceTasks = numReduceTasks * 4;
    }

    // at least 1 reducer
    numReduceTasks = Math.max(1, numReduceTasks);
    // no more than 5000 reducers by default
    numReduceTasks = Math.min(kylinConfig.getHadoopJobMaxReducerNumber(), numReduceTasks);

    jobConf.setInt(MAPRED_REDUCE_TASKS, numReduceTasks);

    System.out.println("Having total map input MB " + Math.round(totalMapInputMB));
    System.out.println("Having level " + level + ", pre-level cuboids " + preLevelCuboids
            + ", this level cuboids " + thisLevelCuboids);
    System.out.println("Having per reduce MB " + perReduceInputMB + ", reduce count ratio " + reduceCountRatio);
    System.out.println("Setting " + MAPRED_REDUCE_TASKS + "=" + numReduceTasks);
}
From source file:com.laizuozuoba.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    // System.setProperty("hadoop.home.dir", "D:\\hadoop-2.2.0");
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    Job job2 = new Job(conf, "uv");
    job2.setJarByClass(WordCount.class);
    job2.setMapperClass(UVMapper.class);
    job2.setCombinerClass(UVReducer.class);
    job2.setReducerClass(UVReducer.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job2, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job2, new Path("hdfs://10.18.106.67:9100/result2"));

    ControlledJob controlledJob = new ControlledJob(job.getConfiguration());
    ControlledJob controlledJob2 = new ControlledJob(job2.getConfiguration());
    controlledJob2.addDependingJob(controlledJob);

    JobControl jc = new JobControl("123");
    jc.addJob(controlledJob);
    jc.addJob(controlledJob2);

    Thread jcThread = new Thread(jc);
    jcThread.start();
    while (true) {
        if (jc.allFinished()) {
            System.out.println(jc.getSuccessfulJobList());
            jc.stop();
            break;
        }
        if (jc.getFailedJobList().size() > 0) {
            System.out.println(jc.getFailedJobList());
            jc.stop();
            break;
        }
        Thread.sleep(1000);
    }
    System.out.println("Finished!!!!!!!!!!!!!!!!!!!!!!!");
}
From source file:com.leon.hadoop.loganalyse.DistributedGrep.java
License:Open Source License
public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: DistributedGrep <regex> <in> <out>");
        ToolRunner.printGenericCommandUsage(System.err);
        System.exit(2);
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Distributed Grep");
    job.setJarByClass(DistributedGrep.class);
    job.setMapperClass(GrepMapper.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.getConfiguration().set(REGEX_KEY, otherArgs[0]);
    FileInputFormat.addInputPath(job, new Path(otherArgs[1]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[2]));

    boolean success = job.waitForCompletion(true);
    return success ? 0 : 1;
}
From source file:com.lightboxtechnologies.spectrum.BlockHasher.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: BlockHasher <imageID> <image> <output>");
        return 2;
    }

    final String imageID = args[0];
    final String image = args[1];
    final String output = args[2];

    Configuration conf = getConf();

    final Job job = SKJobFactory.createJobFromConf(imageID, image, "BlockHasher", conf);
    job.setJarByClass(BlockHasher.class);
    job.setMapperClass(BlockHashMapper.class);
    // job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(0);

    // job ctor copies the Configuration we pass it, get the real one
    conf = job.getConfiguration();

    conf.setLong("timestamp", System.currentTimeMillis());

    job.setInputFormatClass(RawFileInputFormat.class);
    RawFileInputFormat.addInputPath(job, new Path(image));

    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(MD5Hash.class);
    FileOutputFormat.setOutputPath(job, new Path(output));

    conf.setInt("mapred.job.reuse.jvm.num.tasks", -1);

    return job.waitForCompletion(true) ? 0 : 1;
}
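The comment in this example ("job ctor copies the Configuration we pass it, get the real one") points at an easy-to-miss detail: settings applied to the original Configuration after the Job has been constructed do not reach the job. A minimal sketch of that behaviour, under the same assumption the example makes (the com.example.timestamp key is made up):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ConfCopyDemo {
    public static void main(String[] args) throws Exception {
        Configuration original = new Configuration();
        Job job = Job.getInstance(original, "conf-copy-demo");

        // Setting the original after construction has no effect on the job...
        original.setLong("com.example.timestamp", System.currentTimeMillis());
        System.out.println(job.getConfiguration().get("com.example.timestamp")); // null

        // ...so set it on the copy the Job actually uses.
        job.getConfiguration().setLong("com.example.timestamp", System.currentTimeMillis());
        System.out.println(job.getConfiguration().get("com.example.timestamp")); // the timestamp
    }
}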
From source file:com.lightboxtechnologies.spectrum.ExtractData.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length != 4) {
        System.err.println("Usage: ExtractData <imageID> <friendly_name> <extents_file> <evidence file>");
        return 2;
    }

    final String imageID = args[0];
    final String friendlyName = args[1];
    final String extentsPath = args[2];
    final String image = args[3];

    Configuration conf = getConf();

    final Job job = SKJobFactory.createJobFromConf(imageID, friendlyName, "ExtractData", conf);
    job.setJarByClass(ExtractData.class);
    job.setMapperClass(ExtractDataMapper.class);
    job.setReducerClass(KeyValueSortReducer.class);
    job.setNumReduceTasks(1);

    // job ctor copies the Configuration we pass it, get the real one
    conf = job.getConfiguration();

    conf.setLong("timestamp", System.currentTimeMillis());

    job.setInputFormatClass(RawFileInputFormat.class);
    RawFileInputFormat.addInputPath(job, new Path(image));

    job.setOutputFormatClass(HFileOutputFormat.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);

    conf.setInt("mapreduce.job.jvm.numtasks", -1);

    final FileSystem fs = FileSystem.get(conf);
    Path hfileDir = new Path("/texaspete/ev/tmp", UUID.randomUUID().toString());
    hfileDir = hfileDir.makeQualified(fs);
    LOG.info("Hashes will be written temporarily to " + hfileDir);

    HFileOutputFormat.setOutputPath(job, hfileDir);

    final Path extp = new Path(extentsPath);
    final URI extents = extp.toUri();
    LOG.info("extents file is " + extents);

    DistributedCache.addCacheFile(extents, conf);
    conf.set("com.lbt.extentsname", extp.getName());

    // job.getConfiguration().setBoolean("mapred.task.profile", true);
    // job.getConfiguration().setBoolean("mapreduce.task.profile", true);

    HBaseTables.summon(conf, HBaseTables.HASH_TBL_B, HBaseTables.HASH_COLFAM_B);
    HBaseTables.summon(conf, HBaseTables.ENTRIES_TBL_B, HBaseTables.ENTRIES_COLFAM_B);

    final boolean result = job.waitForCompletion(true);
    if (result) {
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(conf);
        HBaseConfiguration.addHbaseResources(conf);
        loader.setConf(conf);
        LOG.info("Loading hashes into hbase");
        chmodR(fs, hfileDir);
        loader.doBulkLoad(hfileDir, new HTable(conf, HBaseTables.HASH_TBL_B));
        // result = fs.delete(hfileDir, true);
    }
    return result ? 0 : 1;
}
From source file:com.lightboxtechnologies.spectrum.FolderCount.java
License:Apache License
public static void main(String[] args) throws Exception {
    final Configuration conf = new Configuration();
    final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: FolderCount <table> <outpath>");
        System.exit(2);
    }

    final Job job = new Job(conf, "FolderCount");
    job.setJarByClass(FolderCount.class);
    job.setMapperClass(FolderCountMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setNumReduceTasks(1);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setInputFormatClass(FsEntryHBaseInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    final Scan scan = new Scan();
    scan.addFamily(HBaseTables.ENTRIES_COLFAM_B);
    job.getConfiguration().set(TableInputFormat.INPUT_TABLE, otherArgs[0]);
    job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));

    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:com.lightboxtechnologies.spectrum.FsEntryHBaseInputFormat.java
License:Apache License
static public void setupJob(Job job, String deviceID) throws IOException, DecoderException {
    Scan scan = new Scan();
    scan.addFamily(HBaseTables.ENTRIES_COLFAM_B);
    /*
    if (deviceID != null && deviceID.length() > 0) {
        byte[] imgID = Hex.decodeHex(deviceID.toCharArray());
        FsEntryRowFilter keyFilt = new FsEntryRowFilter(imgID);
        RowFilter filt = new RowFilter(CompareFilter.CompareOp.EQUAL, keyFilt);
        scan.setFilter(filt);
    }
    */
    HBaseConfiguration.addHbaseResources(job.getConfiguration());
    job.getConfiguration().set(TableInputFormat.INPUT_TABLE, HBaseTables.ENTRIES_TBL);
    job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
    job.getConfiguration().set(SKMapper.ID_KEY, deviceID);
    LOG.info("hbase.zookeeper.quorum:" + job.getConfiguration().get("hbase.zookeeper.quorum"));
}
From source file:com.lightboxtechnologies.spectrum.PythonJob.java
License:Apache License
static void configPyTask(Job job, PyEngine py, String task, String script) throws Exception {
    final Configuration conf = job.getConfiguration();

    conf.set("com.lbt.scriptName." + task, script);
    conf.set("com.lbt.script." + task, Base64.encodeFromFile(script));

    Reader in = null;
    try {
        in = new BufferedReader(new FileReader(script));
        py.eval(in, script);
        in.close();
    } finally {
        IOUtils.closeQuietly(in);
    }
}
From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License:Apache License
/**
 * Set input location and obtain input schema.
 */
@SuppressWarnings("unchecked")
@Override
public void setLocation(String location, Job job) throws IOException {
    if (inputAvroSchema != null) {
        return;
    }

    if (!UDFContext.getUDFContext().isFrontend()) {
        Properties udfProps = getUDFProperties();
        String mergedSchema = udfProps.getProperty(AVRO_MERGED_SCHEMA_PROPERTY);
        if (mergedSchema != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap =
                    (HashMap<URI, Map<Integer, Integer>>) ObjectSerializer.deserialize(mergedSchema);
            schemaToMergedSchemaMap = new HashMap<Path, Map<Integer, Integer>>();
            for (Entry<URI, Map<Integer, Integer>> entry : mergedSchemaMap.entrySet()) {
                schemaToMergedSchemaMap.put(new Path(entry.getKey()), entry.getValue());
            }
        }

        String schema = udfProps.getProperty(AVRO_INPUT_SCHEMA_PROPERTY);
        if (schema != null) {
            try {
                inputAvroSchema = new Schema.Parser().parse(schema);
                return;
            } catch (Exception e) {
                // Cases like testMultipleSchemas2 cause exception while deserializing
                // symbols. In that case, we get it again.
                LOG.warn("Exception while trying to deserialize schema in backend. "
                        + "Will construct again. schema= " + schema, e);
            }
        }
    }

    Configuration conf = job.getConfiguration();
    Set<Path> paths = AvroStorageUtils.getPaths(location, conf, true);
    if (!paths.isEmpty()) {
        // Set top level directories in input format. Adding all files will
        // bloat configuration size
        FileInputFormat.setInputPaths(job, paths.toArray(new Path[paths.size()]));
        // Scan all directories including sub directories for schema
        if (inputAvroSchema == null) {
            setInputAvroSchema(paths, conf);
        }
    } else {
        throw new IOException("Input path \'" + location + "\' is not found");
    }
}