List of usage examples for org.apache.hadoop.mapreduce Job setNumReduceTasks
public void setNumReduceTasks(int tasks) throws IllegalStateException
From source file:com.lightboxtechnologies.spectrum.SequenceFileExport.java
License:Apache License
public static void main(String[] args) throws Exception { final Configuration conf = new Configuration(); final String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); String imageID;/*from ww w . j a va2 s .c om*/ String outpath; String friendlyname; final Set<String> exts = new HashSet<String>(); if ("-f".equals(otherArgs[0])) { if (otherArgs.length != 4) { die(); } // load extensions from file final Path extpath = new Path(otherArgs[1]); InputStream in = null; try { in = extpath.getFileSystem(conf).open(extpath); Reader r = null; try { r = new InputStreamReader(in); BufferedReader br = null; try { br = new BufferedReader(r); String line; while ((line = br.readLine()) != null) { exts.add(line.trim().toLowerCase()); } br.close(); } finally { IOUtils.closeQuietly(br); } r.close(); } finally { IOUtils.closeQuietly(r); } in.close(); } finally { IOUtils.closeQuietly(in); } imageID = otherArgs[2]; friendlyname = otherArgs[3]; outpath = otherArgs[4]; } else { if (otherArgs.length < 3) { die(); } // read extensions from trailing args imageID = otherArgs[0]; friendlyname = otherArgs[1]; outpath = otherArgs[2]; // lowercase all file extensions for (int i = 2; i < otherArgs.length; ++i) { exts.add(otherArgs[i].toLowerCase()); } } conf.setStrings("extensions", exts.toArray(new String[exts.size()])); final Job job = SKJobFactory.createJobFromConf(imageID, friendlyname, "SequenceFileExport", conf); job.setJarByClass(SequenceFileExport.class); job.setMapperClass(SequenceFileExportMapper.class); job.setNumReduceTasks(0); job.setOutputKeyClass(BytesWritable.class); job.setOutputValueClass(MapWritable.class); job.setInputFormatClass(FsEntryHBaseInputFormat.class); FsEntryHBaseInputFormat.setupJob(job, imageID); job.setOutputFormatClass(SequenceFileOutputFormat.class); SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK); FileOutputFormat.setOutputPath(job, new Path(outpath)); System.exit(job.waitForCompletion(true) ? 
0 : 1); }
From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java
License:Apache License
public void run() throws Exception { LOGGER.info("Starting {}", getClass().getSimpleName()); FileSystem fs = FileSystem.get(getConf()); Path inputPathPattern = new Path(_inputSegmentDir); if (fs.exists(new Path(_stagingDir))) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(new Path(_stagingDir), true); }/*from w w w. java 2 s. c o m*/ fs.mkdirs(new Path(_stagingDir)); fs.mkdirs(new Path(_stagingDir + "/input/")); if (fs.exists(new Path(_outputDir))) { LOGGER.warn("Found the output folder, deleting it"); fs.delete(new Path(_outputDir), true); } fs.mkdirs(new Path(_outputDir)); List<FileStatus> inputDataFiles = new ArrayList<FileStatus>(); FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern); for (FileStatus fileStatus : fileStatusArr) { inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath())); } for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((_stagingDir + "/input/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationJob.class); job.setJobName(_jobName); job.setMapperClass(HadoopSegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/")); 
job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema)); job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : _properties.keySet()) { job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString())); } if (_depsJarPath != null && _depsJarPath.length() > 0) { addDepsJarToDistributedCache(new Path(_depsJarPath), job); } // Submit the job for execution. job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName())); } // Delete temporary directory. LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", _stagingDir); fs.delete(new Path(_stagingDir), true); }
From source file:com.linkedin.thirdeye.bootstrap.segment.create.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationPhaseJob.class); job.setJobName(name);//from ww w. ja va 2s . c om FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); String schemaPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEMA_PATH); LOGGER.info("Schema path : {}", schemaPath); String configPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_CONFIG_PATH); LOGGER.info("Config path : {}", configPath); Schema dataSchema = createSchema(configPath); LOGGER.info("Data schema : {}", dataSchema); String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH); LOGGER.info("Input path : {}", inputSegmentDir); String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH); LOGGER.info("Output path : {}", outputDir); String stagingDir = new File(outputDir, TEMP).getAbsolutePath(); LOGGER.info("Staging dir : {}", stagingDir); String tableName = getAndSetConfiguration(configuration, SEGMENT_CREATION_SEGMENT_TABLE_NAME); LOGGER.info("Segment table name : {}", tableName); // Create temporary directory if (fs.exists(new Path(stagingDir))) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(new Path(stagingDir), true); } fs.mkdirs(new Path(stagingDir)); fs.mkdirs(new Path(stagingDir + "/input/")); if (fs.exists(new Path(outputDir))) { LOGGER.warn("Found the output folder deleting it"); fs.delete(new Path(outputDir), true); } fs.mkdirs(new Path(outputDir)); Path inputPathPattern = new Path(inputSegmentDir); List<FileStatus> inputDataFiles = Arrays.asList(fs.listStatus(inputPathPattern)); LOGGER.info("size {}", inputDataFiles.size()); try { for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((stagingDir + "/input/" + 
file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); LOGGER.info("wrote {}", completeFilePath); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } } catch (Exception e) { LOGGER.error("Exception while reading input files ", e); } job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set("data.schema", OBJECT_MAPPER.writeValueAsString(dataSchema)); if (!fs.exists(new Path(schemaPath))) { OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(fs.create(new Path(schemaPath), false), dataSchema); } job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : props.keySet()) { job.getConfiguration().set(key.toString(), props.getProperty(key.toString())); } job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName())); } // Delete temporary directory. 
LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", stagingDir); fs.delete(new Path(stagingDir), true); return job; }
From source file:com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.java
License:Apache License
/**
 * Configures and runs the aggregation phase, then reports its counters.
 *
 * <p>Every entry of the configured input path list is added as a job input, an existing
 * output path is deleted, and the Avro schema obtained for the input is used as the output
 * key schema. The reducer count is only overridden when the corresponding property is
 * non-blank.
 *
 * @return the job after {@code waitForCompletion} has returned
 * @throws IllegalStateException if the record counter is zero after the job has run
 * @throws Exception if configuration, file-system access or job execution fails
 */
public Job run() throws Exception {
  final Job aggregationJob = Job.getInstance(getConf());
  aggregationJob.setJobName(name);
  aggregationJob.setJarByClass(AggregationPhaseJob.class);

  final FileSystem fileSystem = FileSystem.get(getConf());
  final Configuration jobConf = aggregationJob.getConfiguration();

  // Properties
  LOGGER.info("Properties {}", props);

  // Input Path
  final String inputPathDir = getAndSetConfiguration(jobConf, AGG_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  final String[] inputPaths = inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR);
  for (int i = 0; i < inputPaths.length; i++) {
    LOGGER.info("Adding input:" + inputPaths[i]);
    FileInputFormat.addInputPath(aggregationJob, new Path(inputPaths[i]));
  }

  // Output path
  final Path outputPath = new Path(getAndSetConfiguration(jobConf, AGG_PHASE_OUTPUT_PATH));
  LOGGER.info("Output path dir: " + outputPath);
  if (fileSystem.exists(outputPath)) {
    fileSystem.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(aggregationJob, outputPath);

  // Schema
  final Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));
  aggregationJob.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

  // ThirdEyeConfig
  final String metricNames = props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString());
  final String metricTypes = props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString());
  final String metricTypesProperty =
      ThirdeyeAvroUtils.getMetricTypesProperty(metricNames, metricTypes, avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  final ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  aggregationJob.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  aggregationJob.setMapperClass(AggregationMapper.class);
  aggregationJob.setInputFormatClass(AvroKeyInputFormat.class);
  aggregationJob.setMapOutputKeyClass(BytesWritable.class);
  aggregationJob.setMapOutputValueClass(BytesWritable.class);

  // Reduce config
  aggregationJob.setReducerClass(AggregationReducer.class);
  aggregationJob.setOutputKeyClass(AvroKey.class);
  aggregationJob.setOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(aggregationJob, avroSchema);
  aggregationJob.setOutputFormatClass(AvroKeyOutputFormat.class);

  final String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
  LOGGER.info("Num Reducers : {}", numReducers);
  if (StringUtils.isNotBlank(numReducers)) {
    aggregationJob.setNumReduceTasks(Integer.parseInt(numReducers));
    LOGGER.info("Setting num reducers {}", aggregationJob.getNumReduceTasks());
  }

  aggregationJob.waitForCompletion(true);

  // Report counters; an empty input is treated as an error.
  final Counter recordCounter =
      aggregationJob.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
  LOGGER.info(recordCounter.getDisplayName() + " : " + recordCounter.getValue());
  if (recordCounter.getValue() == 0) {
    throw new IllegalStateException("No input records in " + inputPathDir);
  }

  final Counter flattenedCounter =
      aggregationJob.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
  LOGGER.info(flattenedCounter.getDisplayName() + " : " + flattenedCounter.getValue());

  for (String metric : thirdeyeConfig.getMetricNames()) {
    final Counter metricCounter =
        aggregationJob.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
    LOGGER.info(metricCounter.getDisplayName() + " : " + metricCounter.getValue());
  }

  return aggregationJob;
}
From source file:com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(BackfillPhaseJob.class); job.setJobName(name);//from w ww . j a v a 2 s . com FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); LOGGER.info("*******************************************************************************"); String controllerHost = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_HOST); String controllerPort = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_PORT); LOGGER.info("Controller Host : {} Controller Port : {}", controllerHost, controllerPort); String segmentStartTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_START_TIME); String segmentEndTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_END_TIME); long startTime = Long.valueOf(segmentStartTime); long endTime = Long.valueOf(segmentEndTime); if (Long.valueOf(segmentStartTime) > Long.valueOf(segmentEndTime)) { throw new IllegalStateException("Start time cannot be greater than end time"); } String tableName = getAndSetConfiguration(configuration, BACKFILL_PHASE_TABLE_NAME); LOGGER.info("Start time : {} End time : {} Table name : {}", segmentStartTime, segmentEndTime, tableName); String outputPath = getAndSetConfiguration(configuration, BACKFILL_PHASE_OUTPUT_PATH); LOGGER.info("Output path : {}", outputPath); Path backfillDir = new Path(outputPath); if (fs.exists(backfillDir)) { LOGGER.warn("Found the output folder deleting it"); fs.delete(backfillDir, true); } Path downloadDir = new Path(backfillDir, DOWNLOAD); LOGGER.info("Creating download dir : {}", downloadDir); fs.mkdirs(downloadDir); Path inputDir = new Path(backfillDir, INPUT); LOGGER.info("Creating input dir : {}", inputDir); fs.mkdirs(inputDir); Path outputDir = new Path(backfillDir, OUTPUT); LOGGER.info("Creating output dir : {}", outputDir); BackfillControllerAPIs backfillControllerAPIs = new BackfillControllerAPIs(controllerHost, 
Integer.valueOf(controllerPort), tableName); LOGGER.info("Downloading segments in range {} to {}", startTime, endTime); List<String> allSegments = backfillControllerAPIs.getAllSegments(tableName); List<String> segmentsToDownload = backfillControllerAPIs.findSegmentsInRange(tableName, allSegments, startTime, endTime); for (String segmentName : segmentsToDownload) { backfillControllerAPIs.downloadSegment(segmentName, downloadDir); } LOGGER.info("Reading downloaded segment input files"); List<FileStatus> inputDataFiles = new ArrayList<>(); inputDataFiles.addAll(Lists.newArrayList(fs.listStatus(downloadDir))); LOGGER.info("size {}", inputDataFiles.size()); try { LOGGER.info("Creating input files at {} for segment input files", inputDir); for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((inputDir + "/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); LOGGER.info("wrote {}", completeFilePath); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } } catch (Exception e) { LOGGER.error("Exception while reading input files ", e); } job.setMapperClass(BackfillPhaseMapJob.BackfillMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, inputDir); FileOutputFormat.setOutputPath(job, outputDir); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : 
props.keySet()) { job.getConfiguration().set(key.toString(), props.getProperty(key.toString())); } job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Cleanup the working directory"); LOGGER.info("Deleting the dir: {}", downloadDir); fs.delete(downloadDir, true); LOGGER.info("Deleting the dir: {}", inputDir); fs.delete(inputDir, true); LOGGER.info("Deleting the dir: {}", outputDir); fs.delete(outputDir, true); return job; }
From source file:com.linkedin.thirdeye.hadoop.derivedcolumn.transformation.DerivedColumnTransformationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJobName(name);/* w w w.ja va 2s.c o m*/ job.setJarByClass(DerivedColumnTransformationPhaseJob.class); Configuration configuration = job.getConfiguration(); FileSystem fs = FileSystem.get(configuration); // Input Path String inputPathDir = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_INPUT_PATH); LOGGER.info("Input path dir: " + inputPathDir); for (String inputPath : inputPathDir.split(",")) { LOGGER.info("Adding input:" + inputPath); Path input = new Path(inputPath); FileInputFormat.addInputPath(job, input); } // Topk path String topkPath = getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_TOPK_PATH); LOGGER.info("Topk path : " + topkPath); // Output path Path outputPath = new Path( getAndSetConfiguration(configuration, DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_PATH)); LOGGER.info("Output path dir: " + outputPath.toString()); if (fs.exists(outputPath)) { fs.delete(outputPath, true); } FileOutputFormat.setOutputPath(job, outputPath); // Schema Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir); LOGGER.info("Schema : {}", avroSchema.toString(true)); // ThirdEyeConfig String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_THIRDEYE_CONFIG.toString(), OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); // New schema Schema outputSchema = newSchema(thirdeyeConfig); job.getConfiguration().set(DERIVED_COLUMN_TRANSFORMATION_PHASE_OUTPUT_SCHEMA.toString(), 
outputSchema.toString()); // Map config job.setMapperClass(DerivedColumnTransformationPhaseMapper.class); job.setInputFormatClass(AvroKeyInputFormat.class); job.setMapOutputKeyClass(AvroKey.class); job.setMapOutputValueClass(NullWritable.class); AvroJob.setOutputKeySchema(job, outputSchema); LazyOutputFormat.setOutputFormatClass(job, AvroKeyOutputFormat.class); AvroMultipleOutputs.addNamedOutput(job, "avro", AvroKeyOutputFormat.class, outputSchema); job.setNumReduceTasks(0); job.waitForCompletion(true); return job; }
From source file:com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob.java
License:Apache License
public Job run() throws Exception { Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationPhaseJob.class); job.setJobName(name);/*ww w . j a va2 s . c o m*/ FileSystem fs = FileSystem.get(getConf()); Configuration configuration = job.getConfiguration(); String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH); LOGGER.info("Input path : {}", inputSegmentDir); Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputSegmentDir); LOGGER.info("Schema : {}", avroSchema); String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty( props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()), props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema); props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty); ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props); LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode()); String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH); LOGGER.info("Output path : {}", outputDir); Path stagingDir = new Path(outputDir, TEMP); LOGGER.info("Staging dir : {}", stagingDir); String segmentWallClockStart = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_START_TIME); LOGGER.info("Segment wallclock start time : {}", segmentWallClockStart); String segmentWallClockEnd = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_END_TIME); LOGGER.info("Segment wallclock end time : {}", segmentWallClockEnd); String schedule = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEDULE); LOGGER.info("Segment schedule : {}", schedule); String isBackfill = props.getProperty(SEGMENT_CREATION_BACKFILL.toString(), DEFAULT_BACKFILL); configuration.set(SEGMENT_CREATION_BACKFILL.toString(), isBackfill); LOGGER.info("Is Backfill : {}", configuration.get(SEGMENT_CREATION_BACKFILL.toString())); // Create temporary directory if 
(fs.exists(stagingDir)) { LOGGER.warn("Found the temp folder, deleting it"); fs.delete(stagingDir, true); } fs.mkdirs(stagingDir); fs.mkdirs(new Path(stagingDir + "/input/")); // Create output directory if (fs.exists(new Path(outputDir))) { LOGGER.warn("Found the output folder deleting it"); fs.delete(new Path(outputDir), true); } fs.mkdirs(new Path(outputDir)); // Read input files List<FileStatus> inputDataFiles = new ArrayList<>(); for (String input : inputSegmentDir.split(",")) { Path inputPathPattern = new Path(input); inputDataFiles.addAll(Arrays.asList(fs.listStatus(inputPathPattern))); } LOGGER.info("size {}", inputDataFiles.size()); try { for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) { FileStatus file = inputDataFiles.get(seqId); String completeFilePath = " " + file.getPath().toString() + " " + seqId; Path newOutPutFile = new Path((stagingDir + "/input/" + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt")); FSDataOutputStream stream = fs.create(newOutPutFile); LOGGER.info("wrote {}", completeFilePath); stream.writeUTF(completeFilePath); stream.flush(); stream.close(); } } catch (Exception e) { LOGGER.error("Exception while reading input files ", e); } job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class); if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) { job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); } job.setInputFormatClass(TextInputFormat.class); job.setOutputFormatClass(TextOutputFormat.class); job.setMapOutputKeyClass(LongWritable.class); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); job.getConfiguration().set(SEGMENT_CREATION_THIRDEYE_CONFIG.toString(), 
OBJECT_MAPPER.writeValueAsString(thirdeyeConfig)); job.setMaxReduceAttempts(1); job.setMaxMapAttempts(0); job.setNumReduceTasks(0); for (Object key : props.keySet()) { job.getConfiguration().set(key.toString(), props.getProperty(key.toString())); } job.waitForCompletion(true); if (!job.isSuccessful()) { throw new RuntimeException("Job failed : " + job); } LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir); FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar")); for (FileStatus segment : segmentArr) { fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName())); } // Delete temporary directory. LOGGER.info("Cleanup the working directory."); LOGGER.info("Deleting the dir: {}", stagingDir); fs.delete(stagingDir, true); return job; }
From source file:com.linkedin.thirdeye.hadoop.topk.TopKPhaseJob.java
License:Apache License
/**
 * Configures and runs the top-k phase with a mapper, a combiner and a single reduce task.
 *
 * <p>Every entry of the configured input path list is added as a job input, an existing
 * output path is deleted, and the ThirdEye config derived from the input Avro schema is
 * published through the job configuration.
 *
 * @return the job after {@code waitForCompletion} has returned
 * @throws Exception if configuration, file-system access or job execution fails
 */
public Job run() throws Exception {
  final Job topKJob = Job.getInstance(getConf());
  topKJob.setJobName(name);
  topKJob.setJarByClass(TopKPhaseJob.class);

  final Configuration jobConf = topKJob.getConfiguration();
  final FileSystem fileSystem = FileSystem.get(jobConf);

  // Properties
  LOGGER.info("Properties {}", props);

  // Input Path
  final String inputPathDir = getAndSetConfiguration(jobConf, TOPK_PHASE_INPUT_PATH);
  LOGGER.info("Input path dir: " + inputPathDir);
  final String[] inputPaths = inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR);
  for (int i = 0; i < inputPaths.length; i++) {
    LOGGER.info("Adding input:" + inputPaths[i]);
    FileInputFormat.addInputPath(topKJob, new Path(inputPaths[i]));
  }

  // Output path
  final String outputPathName = getAndSetConfiguration(jobConf, TOPK_PHASE_OUTPUT_PATH);
  final Path outputPath = new Path(outputPathName);
  LOGGER.info("Output path dir: " + outputPath);
  if (fileSystem.exists(outputPath)) {
    fileSystem.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(topKJob, outputPath);

  // Schema
  final Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
  LOGGER.info("Schema : {}", avroSchema.toString(true));

  // ThirdEyeConfig
  final String metricNames = props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString());
  final String metricTypes = props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString());
  final String metricTypesProperty =
      ThirdeyeAvroUtils.getMetricTypesProperty(metricNames, metricTypes, avroSchema);
  props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
  final ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
  LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
  topKJob.getConfiguration().set(TOPK_PHASE_THIRDEYE_CONFIG.toString(),
      OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

  // Map config
  topKJob.setMapperClass(TopKPhaseMapper.class);
  topKJob.setInputFormatClass(AvroKeyInputFormat.class);
  topKJob.setMapOutputKeyClass(BytesWritable.class);
  topKJob.setMapOutputValueClass(BytesWritable.class);

  // Combiner
  topKJob.setCombinerClass(TopKPhaseCombiner.class);

  // Reduce config
  topKJob.setReducerClass(TopKPhaseReducer.class);
  topKJob.setOutputKeyClass(NullWritable.class);
  topKJob.setOutputValueClass(NullWritable.class);
  topKJob.setNumReduceTasks(1);

  topKJob.waitForCompletion(true);
  return topKJob;
}
From source file:com.luogh.learning.lab.hbase.IndexBuilderTest.java
License:Apache License
/** * perf_test_schema:perf_test_table_normal_with_partition_key w_cf value * Job configuration./*from w w w . ja v a 2s . co m*/ */ public static Job configureJob(Configuration conf, String[] args) throws IOException { String tableName = args[0]; String columnFamily = args[1]; System.out.println("****" + tableName); conf.set(TableInputFormat.INPUT_TABLE, tableName); conf.set("index.tablename", tableName); conf.set("index.familyname", columnFamily); String[] fields = new String[args.length - 2]; System.arraycopy(args, 2, fields, 0, fields.length); conf.setStrings("index.fields", fields); Job job = Job.getInstance(conf); TableMapReduceUtil.initTableMapperJob(Lists.newArrayList(new Scan()), Map.class, ImmutableBytesWritable.class, Put.class, job); job.setJarByClass(IndexBuilderTest.class); job.setMapperClass(Map.class); job.setNumReduceTasks(0); job.setInputFormatClass(TableInputFormat.class); job.setOutputFormatClass(MultiTableOutputFormat.class); return job; }
From source file:com.main.MRSearchMain.java
public void searchHBase(int numOfDays) throws IOException, InterruptedException, ClassNotFoundException { long startTime; long endTime; String path = "/home/hadoop/app/hadoop-2.0.0-cdh4.3.0/etc/hadoop/"; Configuration conf = HBaseConfiguration.create(); // conf.set("hbase.zookeeper.quorum", "streamslab.localdomain"); // conf.set("fs.default.name", "hdfs://streamslab.localdomain:8020"); // conf.set("mapred.job.tracker", "hdfs://streamslab.localdomain:50300"); // conf.set("fs.hdfs.impl", // org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); //?,FileSystem? conf.addResource(new Path(path + "core-site.xml")); conf.addResource(new Path(path + "hdfs-site.xml")); conf.addResource(new Path(path + "mapred-site.xml")); /* /* w ww . j av a2 s .co m*/ * ?map */ conf.set("search.license", "C87310"); conf.set("search.color", "10"); conf.set("search.direction", "2"); Job job = new Job(conf, "MRSearchHBase"); System.out.println("search.license: " + conf.get("search.license")); job.setNumReduceTasks(0); job.setJarByClass(MRSearchMain.class); Scan scan = new Scan(); scan.addFamily(FAMILY_NAME); byte[] startRow = Bytes.toBytes("2011010100000"); byte[] stopRow; switch (numOfDays) { case 1: stopRow = Bytes.toBytes("2011010200000"); break; case 10: stopRow = Bytes.toBytes("2011011100000"); break; case 30: stopRow = Bytes.toBytes("2011020100000"); break; case 365: stopRow = Bytes.toBytes("2012010100000"); break; default: stopRow = Bytes.toBytes("2011010101000"); } // ?key scan.setStartRow(startRow); scan.setStopRow(stopRow); TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, SearchMapper.class, ImmutableBytesWritable.class, Text.class, job); Path outPath = new Path("searchresult"); LOG.info("outPath:" + outPath.toString()); //hdfs FileSystem file = null; try { file = FileSystem.get(conf); } catch (IOException e) { e.printStackTrace(); } // HDFS_File file = new HDFS_File(); // 
file.DelFile(conf, outPath.getName(), true); // //"hdfs://streamslab.localdomain:8020/ if (file.exists(outPath)) { file.delete(outPath, true); LOG.info("=====delPath " + outPath.toString() + "====="); } FileOutputFormat.setOutputPath(job, outPath);// startTime = System.currentTimeMillis(); job.waitForCompletion(true); endTime = System.currentTimeMillis(); LOG.info("Time used: " + (endTime - startTime)); LOG.info("startRow:" + Text.decode(startRow)); LOG.info("stopRow: " + Text.decode(stopRow)); }