List of usage examples for org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat#setOutputFormatClass
@SuppressWarnings("unchecked") public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)
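LazyOutputFormat wraps a concrete OutputFormat and defers creating the underlying RecordWriter (and thus the part file) until the first record is actually written, so tasks that emit nothing leave no empty part files behind. Before the real-world examples below, here is a minimal, self-contained sketch of the basic pattern: a hypothetical map-only job whose class, mapper, and paths are illustrative placeholders, not taken from the examples that follow.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LazyOutputDemo {

    // Hypothetical map-only filter: emit only lines containing "ERROR",
    // so many input splits may produce no output at all.
    public static class ErrorLineMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value.toString().contains("ERROR")) {
                context.write(NullWritable.get(), value);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "lazy-output-demo");
        job.setJarByClass(LazyOutputDemo.class);
        job.setMapperClass(ErrorLineMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Instead of job.setOutputFormatClass(TextOutputFormat.class):
        // the wrapped format's RecordWriter is created lazily, so map tasks
        // that write no records leave no empty part-m-* files behind.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The same call shows up in every example below; what varies is the wrapped format (SequenceFileOutputFormat, TextOutputFormat, AccumuloFileOutputFormat) and whether it is combined with MultipleOutputs for named outputs.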
From source file:org.apache.kylin.engine.mr.steps.ExtractDictionaryFromGlobalJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());
        job.getConfiguration().set("mapreduce.map.speculative", "false");
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        // Input
        IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
                .getFlatTableInputFormat();
        flatTableInputFormat.configureJob(job);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        deletePath(job.getConfiguration(), output);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsJob.java
License:Apache License
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to its respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
            LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
            NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
From source file:org.apache.kylin.engine.mr.steps.FilterRecommendCuboidDataJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment optSegment = cube.getSegmentById(segmentID);
        CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

        logger.info("Starting: " + job.getJobName());
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(FilterRecommendCuboidDataMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        // add metadata to distributed cache
        attachSegmentMetadataWithDict(originalSegment, job.getConfiguration());

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.mr.steps.UHCDictionaryJob.java
License:Apache License
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
From source file:org.apache.kylin.engine.mr.steps.UpdateOldCuboidShardJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment optSegment = cube.getSegmentById(segmentID);
        CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

        logger.info("Starting: " + job.getJobName());
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(UpdateOldCuboidShardMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        // add metadata to distributed cache
        attachSegmentsMetadataWithDict(Lists.newArrayList(optSegment, originalSegment), job.getConfiguration());

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.spark.SparkFactDistinct.java
License:Apache License
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);
    int samplingPercent = Integer.parseInt(optionsHelper.getOptionValue(OPTION_STATS_SAMPLING_PERCENT));

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") };

    SparkConf conf = new SparkConf()
            .setAppName("Fact distinct columns for:" + cubeName + " segment " + segmentId);
    // serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);

        final Job job = Job.getInstance(sConf.get());

        final FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(
                cubeInstance);

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerMapping.getTotalReducerNum());
        logger.info("getCuboidRowCounterReducerNum: {}", reducerMapping.getCuboidRowCounterReducerNum());
        logger.info("counter path {}", counterPath);

        boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE
                .equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();

        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(isSequenceFile, sc, inputPath,
                hiveTable);

        JavaPairRDD<SelfDefineSortableKey, Text> flatOutputRDD = recordRDD.mapPartitionsToPair(
                new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));

        JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
                new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> outputRDD = aggredRDD
                .mapPartitionsToPair(new MultiOutputFunction(cubeName, metaUrl, sConf, samplingPercent));

        // make each reducer output to its respective dir
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
                NullWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS,
                SequenceFileOutputFormat.class, LongWritable.class, BytesWritable.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
                NullWritable.class, LongWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        FileOutputFormat.setCompressOutput(job, false);

        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(outputRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        long recordCount = recordRDD.count();
        logger.info("Map input records={}", recordCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(recordCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java
License:Apache License
@Override
protected void setupAccumuloOutput(final Job job, final String outputTable) throws AccumuloSecurityException {
    AccumuloOutputFormat.setConnectorInfo(job, childUserName, new PasswordToken(childPwd));
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, outputTable);
    if (!childMock) {
        AccumuloOutputFormat.setZooKeeperInstance(job,
                new ClientConfiguration().withInstance(childInstance).withZkHosts(childZk));
    } else {
        AccumuloOutputFormat.setMockInstance(job, childInstance);
    }
    if (useCopyFileOutput) {
        log.info("Using file output format mode.");
        if (StringUtils.isNotBlank(baseOutputDir)) {
            Path baseOutputPath;
            Path filesOutputPath;
            if (StringUtils.isNotBlank(outputTable)) {
                filesOutputPath = getPath(baseOutputDir, outputTable, "files");
                baseOutputPath = filesOutputPath.getParent();
                job.setOutputFormatClass(AccumuloFileOutputFormat.class);
            } else {
                // If table name is not given, configure output for one level higher:
                // it's up to the job to handle subdirectories. Make sure the parent
                // exists.
                filesOutputPath = getPath(baseOutputDir);
                baseOutputPath = filesOutputPath;
                LazyOutputFormat.setOutputFormatClass(job, AccumuloFileOutputFormat.class);
                MultipleOutputs.setCountersEnabled(job, true);
            }
            log.info("File output destination: " + filesOutputPath);
            if (useCopyFileOutputDirectoryClear) {
                try {
                    clearOutputDir(baseOutputPath);
                } catch (final IOException e) {
                    log.error("Error clearing out output path.", e);
                }
            }
            try {
                final FileSystem fs = FileSystem.get(conf);
                fs.mkdirs(filesOutputPath.getParent());
                fs.setPermission(filesOutputPath.getParent(),
                        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
            } catch (final IOException e) {
                log.error("Failed to set permission for output path.", e);
            }
            FileOutputFormat.setOutputPath(job, filesOutputPath);
            if (StringUtils.isNotBlank(compressionType)) {
                if (isValidCompressionType(compressionType)) {
                    log.info("File compression type: " + compressionType);
                    AccumuloFileOutputFormat.setCompressionType(job, compressionType);
                } else {
                    log.warn("Invalid compression type: " + compressionType);
                }
            }
        }
    } else {
        log.info("Using accumulo output format mode.");
        job.setOutputFormatClass(AccumuloOutputFormat.class);
    }
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj", SequenceFileOutputFormat.class,
            NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *        to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
                MRReasoningUtils.OUTPUT_BASE + iteration + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT, SequenceFileOutputFormat.class,
            Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up a MapReduce job to output human-readable text.
 */
protected void configureTextOutput(String destination) {
    Path outPath = MRReasoningUtils.getOutputPath(job.getConfiguration(), destination);
    TextOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}