List of usage examples for org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat#setOutputFormatClass
@SuppressWarnings("unchecked") public static void setOutputFormatClass(Job job, Class<? extends OutputFormat> theClass)
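LazyOutputFormat wraps a concrete OutputFormat and defers creating the underlying RecordWriter (and thus the part file) until the first record is actually written, so tasks that emit nothing leave no empty part files behind. Before the real-world examples below, here is a minimal, self-contained sketch of the basic pattern: a hypothetical map-only job whose class, mapper, and paths are illustrative placeholders, not taken from the examples that follow.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class LazyOutputDemo {

    // Hypothetical map-only filter: emit only lines containing "ERROR",
    // so many input splits may produce no output at all.
    public static class ErrorLineMapper extends Mapper<LongWritable, Text, NullWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value.toString().contains("ERROR")) {
                context.write(NullWritable.get(), value);
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "lazy-output-demo");
        job.setJarByClass(LazyOutputDemo.class);
        job.setMapperClass(ErrorLineMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Instead of job.setOutputFormatClass(TextOutputFormat.class):
        // the wrapped format's RecordWriter is created lazily, so map tasks
        // that write no records leave no empty part-m-* files behind.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The same call shows up in every example below; what varies is the wrapped format (SequenceFileOutputFormat, TextOutputFormat, AccumuloFileOutputFormat) and whether it is combined with MultipleOutputs for named outputs.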
From source file:org.apache.kylin.engine.mr.steps.ExtractDictionaryFromGlobalJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBING_JOB_ID);
        options.addOption(OPTION_OUTPUT_PATH);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
        job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);

        String cubeName = getOptionValue(OPTION_CUBE_NAME);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);

        // ----------------------------------------------------------------------------
        // add metadata to distributed cache
        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment segment = cube.getSegmentById(segmentID);

        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        logger.info("Starting: " + job.getJobName());
        job.getConfiguration().set("mapreduce.map.speculative", "false");
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(ExtractDictionaryFromGlobalMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        // Input
        IMRInput.IMRTableInputFormat flatTableInputFormat = MRUtil.getBatchCubingInputSide(segment)
                .getFlatTableInputFormat();
        flatTableInputFormat.configureJob(job);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
        FileOutputFormat.setOutputPath(job, output);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        deletePath(job.getConfiguration(), output);

        attachSegmentMetadataWithDict(segment, job.getConfiguration());
        return waitForCompletion(job);
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.mr.steps.FactDistinctColumnsJob.java
License:Apache License
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(FactDistinctColumnsReducer.class);
    job.setPartitionerClass(FactDistinctColumnPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    // make each reducer output to its respective dir
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS, SequenceFileOutputFormat.class,
            LongWritable.class, BytesWritable.class);
    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
            NullWritable.class, LongWritable.class);

    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
From source file:org.apache.kylin.engine.mr.steps.FilterRecommendCuboidDataJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment optSegment = cube.getSegmentById(segmentID);
        CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

        logger.info("Starting: " + job.getJobName());
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(FilterRecommendCuboidDataMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        // add metadata to distributed cache
        attachSegmentMetadataWithDict(originalSegment, job.getConfiguration());

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.mr.steps.UHCDictionaryJob.java
License:Apache License
private void setupReducer(Path output, int numberOfReducers) throws IOException {
    job.setReducerClass(UHCDictionaryReducer.class);
    job.setPartitionerClass(UHCDictionaryPartitioner.class);
    job.setNumReduceTasks(numberOfReducers);

    MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
            NullWritable.class, ArrayPrimitiveWritable.class);
    FileOutputFormat.setOutputPath(job, output);
    job.getConfiguration().set(BatchConstants.CFG_OUTPUT_PATH, output.toString());

    // prevent creating a zero-sized default output
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    deletePath(job.getConfiguration(), output);
}
From source file:org.apache.kylin.engine.mr.steps.UpdateOldCuboidShardJob.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    try {
        options.addOption(OPTION_JOB_NAME);
        options.addOption(OPTION_CUBE_NAME);
        options.addOption(OPTION_SEGMENT_ID);
        options.addOption(OPTION_INPUT_PATH);
        options.addOption(OPTION_OUTPUT_PATH);
        parseOptions(options, args);

        job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
        String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
        String segmentID = getOptionValue(OPTION_SEGMENT_ID);
        Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
        Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

        CubeManager cubeMgr = CubeManager.getInstance(KylinConfig.getInstanceFromEnv());
        CubeInstance cube = cubeMgr.getCube(cubeName);
        CubeSegment optSegment = cube.getSegmentById(segmentID);
        CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

        logger.info("Starting: " + job.getJobName());
        setJobClasspath(job, cube.getConfig());

        // Mapper
        job.setMapperClass(UpdateOldCuboidShardMapper.class);

        // Reducer
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Input
        job.setInputFormatClass(SequenceFileInputFormat.class);
        FileInputFormat.setInputPaths(job, input);

        // Output
        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
        FileOutputFormat.setOutputPath(job, output);

        // set job configuration
        job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
        job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);

        // add metadata to distributed cache
        attachSegmentsMetadataWithDict(Lists.newArrayList(optSegment, originalSegment), job.getConfiguration());

        this.deletePath(job.getConfiguration(), output);

        return waitForCompletion(job);
    } catch (Exception e) {
        logger.error("error in CuboidJob", e);
        printUsage(options);
        throw e;
    } finally {
        if (job != null)
            cleanupTempConfFile(job.getConfiguration());
    }
}
From source file:org.apache.kylin.engine.spark.SparkFactDistinct.java
License:Apache License
@Override
protected void execute(OptionsHelper optionsHelper) throws Exception {
    String cubeName = optionsHelper.getOptionValue(OPTION_CUBE_NAME);
    String metaUrl = optionsHelper.getOptionValue(OPTION_META_URL);
    String segmentId = optionsHelper.getOptionValue(OPTION_SEGMENT_ID);
    String hiveTable = optionsHelper.getOptionValue(OPTION_INPUT_TABLE);
    String inputPath = optionsHelper.getOptionValue(OPTION_INPUT_PATH);
    String outputPath = optionsHelper.getOptionValue(OPTION_OUTPUT_PATH);
    String counterPath = optionsHelper.getOptionValue(OPTION_COUNTER_PATH);
    int samplingPercent = Integer.parseInt(optionsHelper.getOptionValue(OPTION_STATS_SAMPLING_PERCENT));

    Class[] kryoClassArray = new Class[] { Class.forName("scala.reflect.ClassTag$$anon$1"),
            Class.forName("org.apache.kylin.engine.mr.steps.SelfDefineSortableKey") };

    SparkConf conf = new SparkConf()
            .setAppName("Fact distinct columns for:" + cubeName + " segment " + segmentId);
    // serialization conf
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    conf.set("spark.kryo.registrator", "org.apache.kylin.engine.spark.KylinKryoRegistrator");
    conf.set("spark.kryo.registrationRequired", "true").registerKryoClasses(kryoClassArray);

    KylinSparkJobListener jobListener = new KylinSparkJobListener();
    try (JavaSparkContext sc = new JavaSparkContext(conf)) {
        sc.sc().addSparkListener(jobListener);
        HadoopUtil.deletePath(sc.hadoopConfiguration(), new Path(outputPath));

        final SerializableConfiguration sConf = new SerializableConfiguration(sc.hadoopConfiguration());
        KylinConfig envConfig = AbstractHadoopJob.loadKylinConfigFromHdfs(sConf, metaUrl);

        final CubeInstance cubeInstance = CubeManager.getInstance(envConfig).getCube(cubeName);

        final Job job = Job.getInstance(sConf.get());

        final FactDistinctColumnsReducerMapping reducerMapping = new FactDistinctColumnsReducerMapping(
                cubeInstance);

        logger.info("RDD Output path: {}", outputPath);
        logger.info("getTotalReducerNum: {}", reducerMapping.getTotalReducerNum());
        logger.info("getCuboidRowCounterReducerNum: {}", reducerMapping.getCuboidRowCounterReducerNum());
        logger.info("counter path {}", counterPath);

        boolean isSequenceFile = JoinedFlatTable.SEQUENCEFILE
                .equalsIgnoreCase(envConfig.getFlatTableStorageFormat());

        // calculate source record bytes size
        final LongAccumulator bytesWritten = sc.sc().longAccumulator();

        final JavaRDD<String[]> recordRDD = SparkUtil.hiveRecordInputRDD(isSequenceFile, sc, inputPath,
                hiveTable);

        JavaPairRDD<SelfDefineSortableKey, Text> flatOutputRDD = recordRDD.mapPartitionsToPair(
                new FlatOutputFucntion(cubeName, segmentId, metaUrl, sConf, samplingPercent, bytesWritten));

        JavaPairRDD<SelfDefineSortableKey, Iterable<Text>> aggredRDD = flatOutputRDD.groupByKey(
                new FactDistinctPartitioner(cubeName, metaUrl, sConf, reducerMapping.getTotalReducerNum()));

        JavaPairRDD<String, Tuple3<Writable, Writable, String>> outputRDD = aggredRDD
                .mapPartitionsToPair(new MultiOutputFunction(cubeName, metaUrl, sConf, samplingPercent));

        // make each reducer output to its respective dir
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_COLUMN, SequenceFileOutputFormat.class,
                NullWritable.class, Text.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_DICT, SequenceFileOutputFormat.class,
                NullWritable.class, ArrayPrimitiveWritable.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_STATISTICS,
                SequenceFileOutputFormat.class, LongWritable.class, BytesWritable.class);
        MultipleOutputs.addNamedOutput(job, BatchConstants.CFG_OUTPUT_PARTITION, TextOutputFormat.class,
                NullWritable.class, LongWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(outputPath));
        FileOutputFormat.setCompressOutput(job, false);

        // prevent creating a zero-sized default output
        LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

        MultipleOutputsRDD multipleOutputsRDD = MultipleOutputsRDD.rddToMultipleOutputsRDD(outputRDD);
        multipleOutputsRDD.saveAsNewAPIHadoopDatasetWithMultipleOutputs(job.getConfiguration());

        long recordCount = recordRDD.count();
        logger.info("Map input records={}", recordCount);
        logger.info("HDFS Read: {} HDFS Write", bytesWritten.value());

        Map<String, String> counterMap = Maps.newHashMap();
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_COUNT, String.valueOf(recordCount));
        counterMap.put(ExecutableConstants.SOURCE_RECORDS_SIZE, String.valueOf(bytesWritten.value()));

        // save counter to hdfs
        HadoopUtil.writeToSequenceFile(sc.hadoopConfiguration(), counterPath, counterMap);

        HadoopUtil.deleteHDFSMeta(metaUrl);
    }
}
From source file:org.apache.rya.accumulo.mr.merge.CopyTool.java
License:Apache License
@Override
protected void setupAccumuloOutput(final Job job, final String outputTable) throws AccumuloSecurityException {
    AccumuloOutputFormat.setConnectorInfo(job, childUserName, new PasswordToken(childPwd));
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, outputTable);
    if (!childMock) {
        AccumuloOutputFormat.setZooKeeperInstance(job,
                new ClientConfiguration().withInstance(childInstance).withZkHosts(childZk));
    } else {
        AccumuloOutputFormat.setMockInstance(job, childInstance);
    }
    if (useCopyFileOutput) {
        log.info("Using file output format mode.");
        if (StringUtils.isNotBlank(baseOutputDir)) {
            Path baseOutputPath;
            Path filesOutputPath;
            if (StringUtils.isNotBlank(outputTable)) {
                filesOutputPath = getPath(baseOutputDir, outputTable, "files");
                baseOutputPath = filesOutputPath.getParent();
                job.setOutputFormatClass(AccumuloFileOutputFormat.class);
            } else {
                // If table name is not given, configure output for one level higher:
                // it's up to the job to handle subdirectories. Make sure the parent
                // exists.
                filesOutputPath = getPath(baseOutputDir);
                baseOutputPath = filesOutputPath;
                LazyOutputFormat.setOutputFormatClass(job, AccumuloFileOutputFormat.class);
                MultipleOutputs.setCountersEnabled(job, true);
            }
            log.info("File output destination: " + filesOutputPath);
            if (useCopyFileOutputDirectoryClear) {
                try {
                    clearOutputDir(baseOutputPath);
                } catch (final IOException e) {
                    log.error("Error clearing out output path.", e);
                }
            }
            try {
                final FileSystem fs = FileSystem.get(conf);
                fs.mkdirs(filesOutputPath.getParent());
                fs.setPermission(filesOutputPath.getParent(),
                        new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.ALL));
            } catch (final IOException e) {
                log.error("Failed to set permission for output path.", e);
            }
            FileOutputFormat.setOutputPath(job, filesOutputPath);
            if (StringUtils.isNotBlank(compressionType)) {
                if (isValidCompressionType(compressionType)) {
                    log.info("File compression type: " + compressionType);
                    AccumuloFileOutputFormat.setCompressionType(job, compressionType);
                } else {
                    log.warn("Invalid compression type: " + compressionType);
                }
            }
        }
    } else {
        log.info("Using accumulo output format mode.");
        job.setOutputFormatClass(AccumuloOutputFormat.class);
    }
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up the MapReduce job to output a schema (TBox).
 */
protected void configureSchemaOutput() {
    Path outPath = MRReasoningUtils.getSchemaPath(job.getConfiguration());
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(SchemaWritable.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, "schemaobj", SequenceFileOutputFormat.class,
            NullWritable.class, SchemaWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up a MapReduce job to output newly derived triples.
 * @param intermediate True if this is intermediate data. Outputs
 *        to [base]-[iteration]-[temp].
 */
protected void configureDerivationOutput(boolean intermediate) {
    Path outPath;
    Configuration conf = job.getConfiguration();
    int iteration = MRReasoningUtils.getCurrentIteration(conf);
    if (intermediate) {
        outPath = MRReasoningUtils.getOutputPath(conf,
                MRReasoningUtils.OUTPUT_BASE + iteration + MRReasoningUtils.TEMP_SUFFIX);
    } else {
        outPath = MRReasoningUtils.getOutputPath(conf, MRReasoningUtils.OUTPUT_BASE + iteration);
    }
    SequenceFileOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT, SequenceFileOutputFormat.class,
            Fact.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT, SequenceFileOutputFormat.class,
            Derivation.class, NullWritable.class);
    MultipleOutputs.setCountersEnabled(job, true);
    // Set up an output for diagnostic info, if needed
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
}
From source file:org.apache.rya.reasoning.mr.AbstractReasoningTool.java
License:Apache License
/**
 * Set up a MapReduce job to output human-readable text.
 */
protected void configureTextOutput(String destination) {
    Path outPath = MRReasoningUtils.getOutputPath(job.getConfiguration(), destination);
    TextOutputFormat.setOutputPath(job, outPath);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INTERMEDIATE_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.TERMINAL_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.SCHEMA_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.INCONSISTENT_OUT, TextOutputFormat.class,
            NullWritable.class, Text.class);
    MultipleOutputs.addNamedOutput(job, MRReasoningUtils.DEBUG_OUT, TextOutputFormat.class,
            Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(job, true);
}