Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

List of usage examples for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException 

Source Link

Document

Set the OutputFormat for the job.

Usage

From source file:com.linkedin.hadoop.example.WordCountCounters.java

License:Apache License

/**
 * Azkaban will look for a method named `run` to start your job. Use this method to setup all the
 * Hadoop-related configuration for your job and submit it.
 *
 * @throws Exception If there is an exception during the configuration or submission of your job
 *///  ww  w .j  a v  a 2 s . co  m
public void run() throws Exception {
    _logger.info(String.format("Configuring job for the class %s", getClass().getSimpleName()));

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordCountJob.class);
    job.setJobName(_name);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    String inputPath = _properties.getProperty("input.path");
    String outputPath = _properties.getProperty("output.path");
    boolean forceOverwrite = Boolean.parseBoolean(_properties.getProperty("force.output.overwrite", "false"));

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // Before we submit the job, remove the old the output directory
    if (forceOverwrite) {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        fs.delete(FileOutputFormat.getOutputPath(job), true);
    }

    // Since we have Kerberos enabled at LinkedIn, we must add the token to our configuration. If
    // you don't use Kerberos security for your Hadoop cluster, you don't need this code.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Submit the job for execution
    _logger.info(String.format("About to submit the job named %s", _name));
    boolean succeeded = job.waitForCompletion(true);

    // Before we return, display our custom counters for the job in the Azkaban logs
    long inputWords = job.getCounters().findCounter(WordCountCounters.INPUT_WORDS).getValue();
    _logger.info(String.format("Read a total of %d input words", inputWords));

    // Azkaban will not realize the Hadoop job failed unless you specifically throw an exception
    if (!succeeded) {
        throw new Exception(String.format("Azkaban job %s failed", _name));
    }
}

From source file:com.linkedin.oneclick.wordcount.WordCount.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration conf = getConf();

    Job job = new Job(conf, "Word Count");
    job.setJarByClass(WordCount.class);

    String workDirectory = args.length >= 1 ? args[0] : "wordcount";
    Path input = new Path(workDirectory, "input.txt");
    FileSystem fs = input.getFileSystem(conf);
    fs.mkdirs(input.getParent());//from  w w  w.j ava2s. co m
    copy(resourceInputStream(getClass().getResource("/onegin.txt")), createOutputStream(conf, input), conf);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(WordCountMapper.class);
    FileInputFormat.addInputPath(job, input);

    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);
    Path output = clean(conf, new Path(workDirectory, "wordcount"));
    FileOutputFormat.setOutputPath(job, output);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License:Apache License

public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }/*from w ww  . j  a  v a  2 s.c o  m*/
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        stream.writeUTF(completeFilePath);
        stream.flush();
        stream.close();
    }

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);

    job.setMapperClass(HadoopSegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}

From source file:com.linkedin.thirdeye.bootstrap.segment.create.SegmentCreationPhaseJob.java

License:Apache License

public Job run() throws Exception {

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);// w  w  w .  j a v a2s  .  co  m

    FileSystem fs = FileSystem.get(getConf());

    Configuration configuration = job.getConfiguration();

    String schemaPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEMA_PATH);
    LOGGER.info("Schema path : {}", schemaPath);
    String configPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_CONFIG_PATH);
    LOGGER.info("Config path : {}", configPath);
    Schema dataSchema = createSchema(configPath);
    LOGGER.info("Data schema : {}", dataSchema);
    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    String stagingDir = new File(outputDir, TEMP).getAbsolutePath();
    LOGGER.info("Staging dir : {}", stagingDir);
    String tableName = getAndSetConfiguration(configuration, SEGMENT_CREATION_SEGMENT_TABLE_NAME);
    LOGGER.info("Segment table name : {}", tableName);

    // Create temporary directory
    if (fs.exists(new Path(stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(stagingDir), true);
    }
    fs.mkdirs(new Path(stagingDir));
    fs.mkdirs(new Path(stagingDir + "/input/"));

    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }
    fs.mkdirs(new Path(outputDir));

    Path inputPathPattern = new Path(inputSegmentDir);
    List<FileStatus> inputDataFiles = Arrays.asList(fs.listStatus(inputPathPattern));
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", OBJECT_MAPPER.writeValueAsString(dataSchema));
    if (!fs.exists(new Path(schemaPath))) {
        OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(fs.create(new Path(schemaPath), false),
                dataSchema);
    }

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(new Path(stagingDir), true);

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.java

License:Apache License

public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);/*www. j a va  2 s. c o  m*/
    job.setJarByClass(AggregationPhaseJob.class);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    // Properties
    LOGGER.info("Properties {}", props);

    // Input Path
    String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }

    // Output path
    Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));
    job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

    // ThirdEyeConfig
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
    job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    // Map config
    job.setMapperClass(AggregationMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Reduce config
    job.setReducerClass(AggregationReducer.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, avroSchema);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
    LOGGER.info("Num Reducers : {}", numReducers);
    if (StringUtils.isNotBlank(numReducers)) {
        job.setNumReduceTasks(Integer.valueOf(numReducers));
        LOGGER.info("Setting num reducers {}", job.getNumReduceTasks());
    }

    job.waitForCompletion(true);

    Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    if (counter.getValue() == 0) {
        throw new IllegalStateException("No input records in " + inputPathDir);
    }
    counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

    for (String metric : thirdeyeConfig.getMetricNames()) {
        counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
        LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    }

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob.java

License:Apache License

public Job run() throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(BackfillPhaseJob.class);
    job.setJobName(name);//from  w w  w .  j av a2 s  . co  m

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    LOGGER.info("*******************************************************************************");
    String controllerHost = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_HOST);
    String controllerPort = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_PORT);
    LOGGER.info("Controller Host : {} Controller Port : {}", controllerHost, controllerPort);
    String segmentStartTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_START_TIME);
    String segmentEndTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_END_TIME);
    long startTime = Long.valueOf(segmentStartTime);
    long endTime = Long.valueOf(segmentEndTime);
    if (Long.valueOf(segmentStartTime) > Long.valueOf(segmentEndTime)) {
        throw new IllegalStateException("Start time cannot be greater than end time");
    }
    String tableName = getAndSetConfiguration(configuration, BACKFILL_PHASE_TABLE_NAME);
    LOGGER.info("Start time : {} End time : {} Table name : {}", segmentStartTime, segmentEndTime, tableName);

    String outputPath = getAndSetConfiguration(configuration, BACKFILL_PHASE_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputPath);
    Path backfillDir = new Path(outputPath);
    if (fs.exists(backfillDir)) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(backfillDir, true);
    }
    Path downloadDir = new Path(backfillDir, DOWNLOAD);
    LOGGER.info("Creating download dir : {}", downloadDir);
    fs.mkdirs(downloadDir);
    Path inputDir = new Path(backfillDir, INPUT);
    LOGGER.info("Creating input dir : {}", inputDir);
    fs.mkdirs(inputDir);
    Path outputDir = new Path(backfillDir, OUTPUT);
    LOGGER.info("Creating output dir : {}", outputDir);

    BackfillControllerAPIs backfillControllerAPIs = new BackfillControllerAPIs(controllerHost,
            Integer.valueOf(controllerPort), tableName);

    LOGGER.info("Downloading segments in range {} to {}", startTime, endTime);
    List<String> allSegments = backfillControllerAPIs.getAllSegments(tableName);
    List<String> segmentsToDownload = backfillControllerAPIs.findSegmentsInRange(tableName, allSegments,
            startTime, endTime);
    for (String segmentName : segmentsToDownload) {
        backfillControllerAPIs.downloadSegment(segmentName, downloadDir);
    }

    LOGGER.info("Reading downloaded segment input files");
    List<FileStatus> inputDataFiles = new ArrayList<>();
    inputDataFiles.addAll(Lists.newArrayList(fs.listStatus(downloadDir)));
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        LOGGER.info("Creating input files at {} for segment input files", inputDir);
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((inputDir + "/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(BackfillPhaseMapJob.BackfillMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);

    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Cleanup the working directory");
    LOGGER.info("Deleting the dir: {}", downloadDir);
    fs.delete(downloadDir, true);
    LOGGER.info("Deleting the dir: {}", inputDir);
    fs.delete(inputDir, true);
    LOGGER.info("Deleting the dir: {}", outputDir);
    fs.delete(outputDir, true);

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob.java

License:Apache License

public Job run() throws Exception {

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);/*from   w w  w  .j  a  va  2s  .c  om*/

    FileSystem fs = FileSystem.get(getConf());

    Configuration configuration = job.getConfiguration();

    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputSegmentDir);
    LOGGER.info("Schema : {}", avroSchema);
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    Path stagingDir = new Path(outputDir, TEMP);
    LOGGER.info("Staging dir : {}", stagingDir);
    String segmentWallClockStart = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_START_TIME);
    LOGGER.info("Segment wallclock start time : {}", segmentWallClockStart);
    String segmentWallClockEnd = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_END_TIME);
    LOGGER.info("Segment wallclock end time : {}", segmentWallClockEnd);
    String schedule = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEDULE);
    LOGGER.info("Segment schedule : {}", schedule);
    String isBackfill = props.getProperty(SEGMENT_CREATION_BACKFILL.toString(), DEFAULT_BACKFILL);
    configuration.set(SEGMENT_CREATION_BACKFILL.toString(), isBackfill);
    LOGGER.info("Is Backfill : {}", configuration.get(SEGMENT_CREATION_BACKFILL.toString()));

    // Create temporary directory
    if (fs.exists(stagingDir)) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(stagingDir, true);
    }
    fs.mkdirs(stagingDir);
    fs.mkdirs(new Path(stagingDir + "/input/"));

    // Create output directory
    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }
    fs.mkdirs(new Path(outputDir));

    // Read input files
    List<FileStatus> inputDataFiles = new ArrayList<>();
    for (String input : inputSegmentDir.split(",")) {
        Path inputPathPattern = new Path(input);
        inputDataFiles.addAll(Arrays.asList(fs.listStatus(inputPathPattern)));
    }
    LOGGER.info("size {}", inputDataFiles.size());

    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            LOGGER.info("wrote {}", completeFilePath);
            stream.writeUTF(completeFilePath);
            stream.flush();
            stream.close();
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set(SEGMENT_CREATION_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    job.setMaxReduceAttempts(1);
    job.setMaxMapAttempts(0);
    job.setNumReduceTasks(0);
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(stagingDir, true);

    return job;
}

From source file:com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java

License:Apache License

private TaskAttemptContext getContext(String nameOutput) throws IOException {

    TaskAttemptContext taskContext = taskContexts.get(nameOutput);

    if (taskContext != null) {
        return taskContext;
    }/*w  ww . j a v  a2  s .c o m*/

    // The following trick leverages the instantiation of a record writer via
    // the job thus supporting arbitrary output formats.
    context.getConfiguration().set("avro.mo.config.namedOutput", nameOutput);
    Job job = new Job(context.getConfiguration());
    job.setOutputFormatClass(getNamedOutputFormatClass(context, nameOutput));
    Schema keySchema = keySchemas.get(nameOutput + "_KEYSCHEMA");
    Schema valSchema = valSchemas.get(nameOutput + "_VALSCHEMA");

    boolean isMaponly = job.getNumReduceTasks() == 0;

    if (keySchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputKeySchema(job, keySchema);
        else
            AvroJob.setOutputKeySchema(job, keySchema);
    }
    if (valSchema != null) {
        if (isMaponly)
            AvroJob.setMapOutputValueSchema(job, valSchema);
        else
            AvroJob.setOutputValueSchema(job, valSchema);
    }
    taskContext = new TaskAttemptContext(job.getConfiguration(), context.getTaskAttemptID());

    taskContexts.put(nameOutput, taskContext);

    return taskContext;
}

From source file:com.luogh.learning.lab.hbase.IndexBuilderTest.java

License:Apache License

/**
 * perf_test_schema:perf_test_table_normal_with_partition_key w_cf value
 * Job configuration.//from   www .j a va 2s. c  om
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    String tableName = args[0];
    String columnFamily = args[1];
    System.out.println("****" + tableName);
    conf.set(TableInputFormat.INPUT_TABLE, tableName);
    conf.set("index.tablename", tableName);
    conf.set("index.familyname", columnFamily);
    String[] fields = new String[args.length - 2];
    System.arraycopy(args, 2, fields, 0, fields.length);
    conf.setStrings("index.fields", fields);
    Job job = Job.getInstance(conf);
    TableMapReduceUtil.initTableMapperJob(Lists.newArrayList(new Scan()), Map.class,
            ImmutableBytesWritable.class, Put.class, job);
    job.setJarByClass(IndexBuilderTest.class);
    job.setMapperClass(Map.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TableInputFormat.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);
    return job;
}

From source file:com.mapr.db.utils.ImportCSV_MR.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        System.out.println("MapR-DB JSON Tables - Import CSV" + "\nUsage:\n"
                + "\tParam 1: JSON Table Path (MapR-FS)\n" + "\tParam 2: Text File Path (Local-FS)\n"
                + "\tParam 3: Text File Delimiter (Local-FS)\n" + "\tParam 4: Schema File Path (Local-FS)\n");

        System.exit(-1);/*  w w w  .j  a  v  a2  s .  c  o m*/
    }

    outputTable = args[0].toString().trim();
    inputDir = args[1].toString().trim();
    delimiter = args[2].toString().trim();
    schemaFile = args[3].toString().trim();

    BasicConfigurator.configure();
    Logger.getRootLogger().setLevel(Level.ERROR);

    ImportCSV_MR imp = new ImportCSV_MR();

    imp.readSchema(schemaFile);
    imp.printSchema();

    Job job = Job.getInstance(conf, "ImportCSV_MR");
    job.setJarByClass(ImportCSV_MR.class);

    job.setMapperClass(MyMapper.class);

    conf = job.getConfiguration();
    conf.setStrings("io.serializations",
            new String[] { conf.get("io.serializations"), JSONDocumentSerialization.class.getName() });

    conf.set("countColumnsInSchema", String.valueOf(countColumnsInSchema));

    conf.set("delimiter", delimiter);

    conf.set("tablePath", outputTable);

    String valueTypes[] = valueTypesInSchema.toArray(new String[valueTypesInSchema.size()]);
    conf.setStrings("valueTypesInSchema", valueTypes);

    String columnNames[] = columnNamesInSchema.toArray(new String[columnNamesInSchema.size()]);
    conf.setStrings("columnNamesInSchema", columnNames);

    //Deciding the appropriate Input format class along with their input path
    FileInputFormat.addInputPath(job, new Path(inputDir));
    job.setInputFormatClass(TextInputFormat.class);

    //Mapper output record key and value class
    job.setMapOutputKeyClass(ByteBufWritableComparable.class);
    job.setMapOutputValueClass(DBDocumentImpl.class);

    //Deciding the appropriate Output format class along with their input path
    conf.set("maprdb.mapred.outputtable", outputTable);
    job.setOutputFormatClass(TableOutputFormat.class);

    //Reducer output record key and value class
    job.setNumReduceTasks(0);

    boolean isJobSuccessful = job.waitForCompletion(true);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
    return 0;
}