Example usage for org.apache.hadoop.mapreduce Job setOutputFormatClass

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce Job setOutputFormatClass.

Prototype

public void setOutputFormatClass(Class<? extends OutputFormat> cls) throws IllegalStateException

Source Link

Document

Set the OutputFormat for the job.

Usage

From source file:com.linkedin.hadoop.example.WordCountCounters.java

License:Apache License

/**
 * Entry point for the job: Azkaban looks for a method named `run` and calls it. All of the
 * Hadoop-related configuration is assembled here, then the job is submitted and awaited.
 *
 * @throws Exception If configuring or submitting the job fails, or if the job does not succeed
 */
public void run() throws Exception {
    _logger.info(String.format("Configuring job for the class %s", getClass().getSimpleName()));

    final Job wordCountJob = Job.getInstance(getConf());
    wordCountJob.setJobName(_name);
    wordCountJob.setJarByClass(WordCountJob.class);

    // Plain text in, plain text out
    wordCountJob.setInputFormatClass(TextInputFormat.class);
    wordCountJob.setOutputFormatClass(TextOutputFormat.class);

    // Map -> combine -> reduce pipeline
    wordCountJob.setMapperClass(WordCountMapper.class);
    wordCountJob.setCombinerClass(WordCountCombiner.class);
    wordCountJob.setReducerClass(WordCountReducer.class);

    // Intermediate and final records use the same key/value types
    wordCountJob.setMapOutputKeyClass(Text.class);
    wordCountJob.setMapOutputValueClass(LongWritable.class);
    wordCountJob.setOutputKeyClass(Text.class);
    wordCountJob.setOutputValueClass(LongWritable.class);

    // Input and output locations come from the job properties
    final String inputPath = _properties.getProperty("input.path");
    final String outputPath = _properties.getProperty("output.path");
    FileInputFormat.addInputPath(wordCountJob, new Path(inputPath));
    FileOutputFormat.setOutputPath(wordCountJob, new Path(outputPath));

    // Before we submit the job, remove the old output directory (only when explicitly requested)
    final String overwriteFlag = _properties.getProperty("force.output.overwrite", "false");
    if (Boolean.parseBoolean(overwriteFlag)) {
        final Path staleOutput = FileOutputFormat.getOutputPath(wordCountJob);
        FileSystem.get(wordCountJob.getConfiguration()).delete(staleOutput, true);
    }

    // Since we have Kerberos enabled at LinkedIn, we must add the token to our configuration. If
    // you don't use Kerberos security for your Hadoop cluster, you don't need this code.
    final String tokenFile = System.getenv("HADOOP_TOKEN_FILE_LOCATION");
    if (tokenFile != null) {
        wordCountJob.getConfiguration().set("mapreduce.job.credentials.binary", tokenFile);
    }

    // Submit the job and block until it finishes
    _logger.info(String.format("About to submit the job named %s", _name));
    final boolean succeeded = wordCountJob.waitForCompletion(true);

    // Surface the custom counter in the Azkaban logs before reporting the outcome
    final long inputWords = wordCountJob.getCounters().findCounter(WordCountCounters.INPUT_WORDS).getValue();
    _logger.info(String.format("Read a total of %d input words", inputWords));

    if (succeeded) {
        return;
    }

    // Azkaban will not realize the Hadoop job failed unless you specifically throw an exception
    throw new Exception(String.format("Azkaban job %s failed", _name));
}

From source file:com.linkedin.oneclick.wordcount.WordCount.java

License:Apache License

/**
 * Configures and runs the "Word Count" job, using the classpath resource {@code /onegin.txt}
 * as its input.
 *
 * @param args optional; {@code args[0]} is the working directory (defaults to "wordcount")
 * @return 0 if the job completed successfully, -1 otherwise
 * @throws Exception if preparing the input or running the job fails
 */
public int run(String[] args) throws Exception {
    final Configuration conf = getConf();

    final Job job = new Job(conf, "Word Count");
    job.setJarByClass(WordCount.class);

    final String workDirectory;
    if (args.length >= 1) {
        workDirectory = args[0];
    } else {
        workDirectory = "wordcount";
    }

    // Make sure <workDirectory> exists, then hand the bundled resource and an output
    // stream for <workDirectory>/input.txt to copy(...)
    final Path input = new Path(workDirectory, "input.txt");
    input.getFileSystem(conf).mkdirs(input.getParent());
    copy(resourceInputStream(getClass().getResource("/onegin.txt")), createOutputStream(conf, input), conf);

    // Map side
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(WordCountMapper.class);
    FileInputFormat.addInputPath(job, input);

    // The reducer class doubles as the combiner
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    // Output side
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    final Path output = clean(conf, new Path(workDirectory, "wordcount"));
    FileOutputFormat.setOutputPath(job, output);

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return -1;
}

From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License:Apache License

/**
 * Runs the segment creation job end to end: prepares the staging and output directories,
 * writes one small descriptor file per input data file into {@code <staging>/input/},
 * launches a map-only Hadoop job over those descriptors, moves the files found in
 * {@code <staging>/output/segmentTar} to the output directory and finally deletes the
 * staging directory.
 *
 * @throws RuntimeException if the input pattern matches nothing or the Hadoop job is not successful
 * @throws Exception if any file system operation or the job submission fails
 */
public void run() throws Exception {
    LOGGER.info("Starting {}", getClass().getSimpleName());

    FileSystem fs = FileSystem.get(getConf());
    Path inputPathPattern = new Path(_inputSegmentDir);

    if (fs.exists(new Path(_stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(_stagingDir), true);
    }
    fs.mkdirs(new Path(_stagingDir));
    fs.mkdirs(new Path(_stagingDir + "/input/"));

    if (fs.exists(new Path(_outputDir))) {
        LOGGER.warn("Found the output folder, deleting it");
        fs.delete(new Path(_outputDir), true);
    }
    fs.mkdirs(new Path(_outputDir));

    List<FileStatus> inputDataFiles = new ArrayList<FileStatus>();
    FileStatus[] fileStatusArr = fs.globStatus(inputPathPattern);
    // globStatus returns null (not an empty array) when a non-glob path does not exist;
    // fail with a clear message instead of a NullPointerException in the loop below.
    if (fileStatusArr == null) {
        throw new RuntimeException("No input found for path : " + inputPathPattern);
    }
    for (FileStatus fileStatus : fileStatusArr) {
        inputDataFiles.addAll(getDataFilesFromPath(fs, fileStatus.getPath()));
    }

    // One descriptor file per data file; its content is " <data file path> <sequence id>".
    for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
        FileStatus file = inputDataFiles.get(seqId);
        String completeFilePath = " " + file.getPath().toString() + " " + seqId;
        // Flatten the data file's path into a single legal file name.
        Path newOutPutFile = new Path((_stagingDir + "/input/"
                + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_') + ".txt"));
        FSDataOutputStream stream = fs.create(newOutPutFile);
        try {
            stream.writeUTF(completeFilePath);
            stream.flush();
        } finally {
            // Always release the stream, even when the write fails.
            stream.close();
        }
    }

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationJob.class);
    job.setJobName(_jobName);

    job.setMapperClass(HadoopSegmentCreationMapper.class);

    // Forward the delegation token file, when the environment provides one.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/"));

    // Request one map task per descriptor file.
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", new ObjectMapper().writeValueAsString(_dataSchema));

    job.setMaxReduceAttempts(1);
    // NOTE(review): a maximum of 0 map attempts looks unusual - confirm this is intended.
    job.setMaxMapAttempts(0);
    // Map-only job.
    job.setNumReduceTasks(0);
    // Copy every job property into the Hadoop configuration; a property with the same key
    // replaces a value set above.
    for (Object key : _properties.keySet()) {
        job.getConfiguration().set(key.toString(), _properties.getProperty(key.toString()));
    }

    if (_depsJarPath != null && _depsJarPath.length() > 0) {
        addDepsJarToDistributedCache(new Path(_depsJarPath), job);
    }

    // Submit the job for execution.
    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", _stagingDir + "/output/segmentTar", _outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(_stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        // NOTE(review): rename's boolean result is ignored; a failed move would go unnoticed
        // before the staging directory is deleted below.
        fs.rename(segment.getPath(), new Path(_outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", _stagingDir);
    fs.delete(new Path(_stagingDir), true);
}

From source file:com.linkedin.thirdeye.bootstrap.segment.create.SegmentCreationPhaseJob.java

License:Apache License

/**
 * Runs the segment creation phase: reads the phase settings into the job configuration,
 * prepares the output and staging directories, writes one descriptor file per input file
 * into {@code <staging>/input/}, launches a map-only Hadoop job over those descriptors,
 * moves the files found in {@code <staging>/output/segmentTar} to the output directory and
 * deletes the staging directory.
 *
 * @return the completed job
 * @throws RuntimeException if the Hadoop job is not successful
 * @throws Exception if configuration, a file system operation or the job submission fails
 */
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());

    Configuration configuration = job.getConfiguration();

    String schemaPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEMA_PATH);
    LOGGER.info("Schema path : {}", schemaPath);
    String configPath = getAndSetConfiguration(configuration, SEGMENT_CREATION_CONFIG_PATH);
    LOGGER.info("Config path : {}", configPath);
    Schema dataSchema = createSchema(configPath);
    LOGGER.info("Data schema : {}", dataSchema);
    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    // Build the staging location with Hadoop's Path rather than java.io.File: File collapses
    // the "//" of URI-qualified paths and resolves relative paths against the local working
    // directory, neither of which is meaningful for a Hadoop file system path.
    String stagingDir = new Path(outputDir, TEMP).toString();
    LOGGER.info("Staging dir : {}", stagingDir);
    String tableName = getAndSetConfiguration(configuration, SEGMENT_CREATION_SEGMENT_TABLE_NAME);
    LOGGER.info("Segment table name : {}", tableName);

    // Remove leftovers first. The staging directory lives under the output directory, so the
    // directories are only created after both deletes; creating staging earlier would have it
    // wiped again by the recursive delete of the output directory.
    if (fs.exists(new Path(stagingDir))) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(new Path(stagingDir), true);
    }
    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }
    fs.mkdirs(new Path(outputDir));

    // Create temporary directory
    fs.mkdirs(new Path(stagingDir));
    fs.mkdirs(new Path(stagingDir + "/input/"));

    Path inputPathPattern = new Path(inputSegmentDir);
    List<FileStatus> inputDataFiles = Arrays.asList(fs.listStatus(inputPathPattern));
    LOGGER.info("size {}", inputDataFiles.size());

    // One descriptor file per input file; its content is " <input file path> <sequence id>".
    // NOTE(review): a failure here is only logged and the job is still submitted - confirm
    // that this best-effort behaviour is intended.
    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            // Flatten the input file's path into a single legal file name.
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            FSDataOutputStream stream = fs.create(newOutPutFile);
            try {
                LOGGER.info("wrote {}", completeFilePath);
                stream.writeUTF(completeFilePath);
                stream.flush();
            } finally {
                // Always release the stream, even when the write fails.
                stream.close();
            }
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    // Forward the delegation token file, when the environment provides one.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    // Request one map task per descriptor file.
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set("data.schema", OBJECT_MAPPER.writeValueAsString(dataSchema));
    // Persist the schema at schemaPath unless a file already exists there.
    if (!fs.exists(new Path(schemaPath))) {
        OBJECT_MAPPER.writerWithDefaultPrettyPrinter().writeValue(fs.create(new Path(schemaPath), false),
                dataSchema);
    }

    job.setMaxReduceAttempts(1);
    // NOTE(review): a maximum of 0 map attempts looks unusual - confirm this is intended.
    job.setMaxMapAttempts(0);
    // Map-only job.
    job.setNumReduceTasks(0);
    // Copy every property into the Hadoop configuration; a property with the same key
    // replaces a value set above.
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        // NOTE(review): rename's boolean result is ignored; a failed move would go unnoticed
        // before the staging directory is deleted below.
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(new Path(stagingDir), true);

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.aggregation.AggregationPhaseJob.java

License:Apache License

/**
 * Configures and runs the aggregation phase: registers every input path, clears and sets
 * the output path, publishes the Avro schema and the ThirdEye config through the job
 * configuration, runs the map/reduce job and logs its counters.
 *
 * @return the completed job
 * @throws RuntimeException if the Hadoop job is not successful
 * @throws IllegalStateException if the job processed zero input records
 * @throws Exception if configuration, a file system operation or the job submission fails
 */
public Job run() throws Exception {
    Job job = Job.getInstance(getConf());
    job.setJobName(name);
    job.setJarByClass(AggregationPhaseJob.class);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    // Properties
    LOGGER.info("Properties {}", props);

    // Input Path: the configured value may hold several paths joined by FIELD_SEPARATOR
    String inputPathDir = getAndSetConfiguration(configuration, AGG_PHASE_INPUT_PATH);
    LOGGER.info("Input path dir: " + inputPathDir);
    for (String inputPath : inputPathDir.split(ThirdEyeConstants.FIELD_SEPARATOR)) {
        LOGGER.info("Adding input:" + inputPath);
        Path input = new Path(inputPath);
        FileInputFormat.addInputPath(job, input);
    }

    // Output path: any existing output is removed before the job runs
    Path outputPath = new Path(getAndSetConfiguration(configuration, AGG_PHASE_OUTPUT_PATH));
    LOGGER.info("Output path dir: " + outputPath.toString());
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    FileOutputFormat.setOutputPath(job, outputPath);

    // Schema
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputPathDir);
    LOGGER.info("Schema : {}", avroSchema.toString(true));
    job.getConfiguration().set(AGG_PHASE_AVRO_SCHEMA.toString(), avroSchema.toString());

    // ThirdEyeConfig: the metric types property is recomputed and written back into props
    // before the config is built from them
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("Thirdeye Config {}", thirdeyeConfig.encode());
    job.getConfiguration().set(AGG_PHASE_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    // Map config
    job.setMapperClass(AggregationMapper.class);
    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(BytesWritable.class);

    // Reduce config
    job.setReducerClass(AggregationReducer.class);
    job.setOutputKeyClass(AvroKey.class);
    job.setOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, avroSchema);
    job.setOutputFormatClass(AvroKeyOutputFormat.class);
    String numReducers = props.getProperty(ThirdEyeJobProperties.THIRDEYE_NUM_REDUCERS.getName());
    LOGGER.info("Num Reducers : {}", numReducers);
    // Only override the reducer count when the property is set
    if (StringUtils.isNotBlank(numReducers)) {
        job.setNumReduceTasks(Integer.parseInt(numReducers));
        LOGGER.info("Setting num reducers {}", job.getNumReduceTasks());
    }

    job.waitForCompletion(true);
    // Report a failed job as such, instead of letting it surface below as a misleading
    // "No input records" error.
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    Counter counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    if (counter.getValue() == 0) {
        throw new IllegalStateException("No input records in " + inputPathDir);
    }
    counter = job.getCounters().findCounter(AggregationCounter.NUMBER_OF_RECORDS_FLATTENED);
    LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());

    // One counter per metric, looked up in the counter group named after the collection
    for (String metric : thirdeyeConfig.getMetricNames()) {
        counter = job.getCounters().findCounter(thirdeyeConfig.getCollection(), metric);
        LOGGER.info(counter.getDisplayName() + " : " + counter.getValue());
    }

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.backfill.BackfillPhaseJob.java

License:Apache License

/**
 * Runs the backfill phase: downloads the table's segments that fall inside the configured
 * time range, writes one descriptor file per downloaded file into the input directory,
 * launches a map-only Hadoop job over those descriptors and finally deletes the download,
 * input and output working directories.
 *
 * @return the completed job
 * @throws IllegalStateException if the configured start time is greater than the end time
 * @throws NumberFormatException if the start time, end time or controller port is not numeric
 * @throws RuntimeException if the Hadoop job is not successful
 * @throws Exception if configuration, a download, a file system operation or the job submission fails
 */
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());
    job.setJarByClass(BackfillPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());
    Configuration configuration = job.getConfiguration();

    LOGGER.info("*******************************************************************************");
    String controllerHost = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_HOST);
    String controllerPort = getAndSetConfiguration(configuration, BACKFILL_PHASE_CONTROLLER_PORT);
    LOGGER.info("Controller Host : {} Controller Port : {}", controllerHost, controllerPort);
    String segmentStartTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_START_TIME);
    String segmentEndTime = getAndSetConfiguration(configuration, BACKFILL_PHASE_END_TIME);
    // Parse each bound once and compare the primitive values.
    long startTime = Long.parseLong(segmentStartTime);
    long endTime = Long.parseLong(segmentEndTime);
    if (startTime > endTime) {
        throw new IllegalStateException("Start time cannot be greater than end time");
    }
    String tableName = getAndSetConfiguration(configuration, BACKFILL_PHASE_TABLE_NAME);
    LOGGER.info("Start time : {} End time : {} Table name : {}", segmentStartTime, segmentEndTime, tableName);

    String outputPath = getAndSetConfiguration(configuration, BACKFILL_PHASE_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputPath);
    Path backfillDir = new Path(outputPath);
    if (fs.exists(backfillDir)) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(backfillDir, true);
    }
    Path downloadDir = new Path(backfillDir, DOWNLOAD);
    LOGGER.info("Creating download dir : {}", downloadDir);
    fs.mkdirs(downloadDir);
    Path inputDir = new Path(backfillDir, INPUT);
    LOGGER.info("Creating input dir : {}", inputDir);
    fs.mkdirs(inputDir);
    // The output directory is not created here; it is only handed to the job as its
    // output path further down.
    Path outputDir = new Path(backfillDir, OUTPUT);
    LOGGER.info("Creating output dir : {}", outputDir);

    BackfillControllerAPIs backfillControllerAPIs = new BackfillControllerAPIs(controllerHost,
            Integer.valueOf(controllerPort), tableName);

    LOGGER.info("Downloading segments in range {} to {}", startTime, endTime);
    List<String> allSegments = backfillControllerAPIs.getAllSegments(tableName);
    List<String> segmentsToDownload = backfillControllerAPIs.findSegmentsInRange(tableName, allSegments,
            startTime, endTime);
    for (String segmentName : segmentsToDownload) {
        backfillControllerAPIs.downloadSegment(segmentName, downloadDir);
    }

    LOGGER.info("Reading downloaded segment input files");
    List<FileStatus> inputDataFiles = new ArrayList<>();
    inputDataFiles.addAll(Lists.newArrayList(fs.listStatus(downloadDir)));
    LOGGER.info("size {}", inputDataFiles.size());

    // One descriptor file per downloaded file; its content is " <file path> <sequence id>".
    // NOTE(review): a failure here is only logged and the job is still submitted - confirm
    // that this best-effort behaviour is intended.
    try {
        LOGGER.info("Creating input files at {} for segment input files", inputDir);
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            // Flatten the downloaded file's path into a single legal file name.
            Path newOutPutFile = new Path((inputDir + "/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            // try-with-resources releases the stream even when the write fails.
            try (FSDataOutputStream stream = fs.create(newOutPutFile)) {
                LOGGER.info("wrote {}", completeFilePath);
                stream.writeUTF(completeFilePath);
                stream.flush();
            }
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(BackfillPhaseMapJob.BackfillMapper.class);

    // Forward the delegation token file, when the environment provides one.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);

    // Request one map task per descriptor file.
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.setMaxReduceAttempts(1);
    // NOTE(review): a maximum of 0 map attempts looks unusual - confirm this is intended.
    job.setMaxMapAttempts(0);
    // Map-only job.
    job.setNumReduceTasks(0);

    // Copy every property into the Hadoop configuration; a property with the same key
    // replaces a value set above.
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Cleanup the working directory");
    LOGGER.info("Deleting the dir: {}", downloadDir);
    fs.delete(downloadDir, true);
    LOGGER.info("Deleting the dir: {}", inputDir);
    fs.delete(inputDir, true);
    LOGGER.info("Deleting the dir: {}", outputDir);
    fs.delete(outputDir, true);

    return job;
}

From source file:com.linkedin.thirdeye.hadoop.segment.creation.SegmentCreationPhaseJob.java

License:Apache License

/**
 * Runs the segment creation phase: derives the ThirdEye config from the input's Avro
 * schema and the properties, prepares the output and staging directories, writes one
 * descriptor file per input file into {@code <staging>/input/}, launches a map-only Hadoop
 * job over those descriptors, moves the files found in {@code <staging>/output/segmentTar}
 * to the output directory and deletes the staging directory.
 *
 * @return the completed job
 * @throws RuntimeException if the Hadoop job is not successful
 * @throws Exception if configuration, a file system operation or the job submission fails
 */
public Job run() throws Exception {

    Job job = Job.getInstance(getConf());

    job.setJarByClass(SegmentCreationPhaseJob.class);
    job.setJobName(name);

    FileSystem fs = FileSystem.get(getConf());

    Configuration configuration = job.getConfiguration();

    String inputSegmentDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_INPUT_PATH);
    LOGGER.info("Input path : {}", inputSegmentDir);
    Schema avroSchema = ThirdeyeAvroUtils.getSchema(inputSegmentDir);
    LOGGER.info("Schema : {}", avroSchema);
    // The metric types property is recomputed and written back into props before the
    // config is built from them.
    String metricTypesProperty = ThirdeyeAvroUtils.getMetricTypesProperty(
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_NAMES.toString()),
            props.getProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString()), avroSchema);
    props.setProperty(ThirdEyeConfigProperties.THIRDEYE_METRIC_TYPES.toString(), metricTypesProperty);
    ThirdEyeConfig thirdeyeConfig = ThirdEyeConfig.fromProperties(props);
    LOGGER.info("ThirdEyeConfig {}", thirdeyeConfig.encode());
    String outputDir = getAndSetConfiguration(configuration, SEGMENT_CREATION_OUTPUT_PATH);
    LOGGER.info("Output path : {}", outputDir);
    Path stagingDir = new Path(outputDir, TEMP);
    LOGGER.info("Staging dir : {}", stagingDir);
    String segmentWallClockStart = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_START_TIME);
    LOGGER.info("Segment wallclock start time : {}", segmentWallClockStart);
    String segmentWallClockEnd = getAndSetConfiguration(configuration, SEGMENT_CREATION_WALLCLOCK_END_TIME);
    LOGGER.info("Segment wallclock end time : {}", segmentWallClockEnd);
    String schedule = getAndSetConfiguration(configuration, SEGMENT_CREATION_SCHEDULE);
    LOGGER.info("Segment schedule : {}", schedule);
    String isBackfill = props.getProperty(SEGMENT_CREATION_BACKFILL.toString(), DEFAULT_BACKFILL);
    configuration.set(SEGMENT_CREATION_BACKFILL.toString(), isBackfill);
    LOGGER.info("Is Backfill : {}", configuration.get(SEGMENT_CREATION_BACKFILL.toString()));

    // Remove leftovers first. The staging directory is resolved against the output
    // directory, so the directories are only created after both deletes; creating staging
    // earlier would have it wiped again by the recursive delete of the output directory.
    if (fs.exists(stagingDir)) {
        LOGGER.warn("Found the temp folder, deleting it");
        fs.delete(stagingDir, true);
    }
    if (fs.exists(new Path(outputDir))) {
        LOGGER.warn("Found the output folder deleting it");
        fs.delete(new Path(outputDir), true);
    }

    // Create output directory
    fs.mkdirs(new Path(outputDir));

    // Create temporary directory
    fs.mkdirs(stagingDir);
    fs.mkdirs(new Path(stagingDir + "/input/"));

    // Read input files: the configured value may hold several comma-separated directories
    List<FileStatus> inputDataFiles = new ArrayList<>();
    for (String input : inputSegmentDir.split(",")) {
        Path inputPathPattern = new Path(input);
        inputDataFiles.addAll(Arrays.asList(fs.listStatus(inputPathPattern)));
    }
    LOGGER.info("size {}", inputDataFiles.size());

    // One descriptor file per input file; its content is " <input file path> <sequence id>".
    // NOTE(review): a failure here is only logged and the job is still submitted - confirm
    // that this best-effort behaviour is intended.
    try {
        for (int seqId = 0; seqId < inputDataFiles.size(); ++seqId) {
            FileStatus file = inputDataFiles.get(seqId);
            String completeFilePath = " " + file.getPath().toString() + " " + seqId;
            // Flatten the input file's path into a single legal file name.
            Path newOutPutFile = new Path((stagingDir + "/input/"
                    + file.getPath().toString().replace('.', '_').replace('/', '_').replace(':', '_')
                    + ".txt"));
            // try-with-resources releases the stream even when the write fails.
            try (FSDataOutputStream stream = fs.create(newOutPutFile)) {
                LOGGER.info("wrote {}", completeFilePath);
                stream.writeUTF(completeFilePath);
                stream.flush();
            }
        }
    } catch (Exception e) {
        LOGGER.error("Exception while reading input files ", e);
    }

    job.setMapperClass(SegmentCreationPhaseMapReduceJob.SegmentCreationMapper.class);

    // Forward the delegation token file, when the environment provides one.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job, new Path(stagingDir + "/input/"));
    FileOutputFormat.setOutputPath(job, new Path(stagingDir + "/output/"));

    // Request one map task per descriptor file.
    job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size());
    job.getConfiguration().set(SEGMENT_CREATION_THIRDEYE_CONFIG.toString(),
            OBJECT_MAPPER.writeValueAsString(thirdeyeConfig));

    job.setMaxReduceAttempts(1);
    // NOTE(review): a maximum of 0 map attempts looks unusual - confirm this is intended.
    job.setMaxMapAttempts(0);
    // Map-only job.
    job.setNumReduceTasks(0);
    // Copy every property into the Hadoop configuration; a property with the same key
    // replaces a value set above.
    for (Object key : props.keySet()) {
        job.getConfiguration().set(key.toString(), props.getProperty(key.toString()));
    }

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed : " + job);
    }

    LOGGER.info("Moving Segment Tar files from {} to: {}", stagingDir + "/output/segmentTar", outputDir);
    FileStatus[] segmentArr = fs.listStatus(new Path(stagingDir + "/output/segmentTar"));
    for (FileStatus segment : segmentArr) {
        // NOTE(review): rename's boolean result is ignored; a failed move would go unnoticed
        // before the staging directory is deleted below.
        fs.rename(segment.getPath(), new Path(outputDir, segment.getPath().getName()));
    }

    // Delete temporary directory.
    LOGGER.info("Cleanup the working directory.");
    LOGGER.info("Deleting the dir: {}", stagingDir);
    fs.delete(stagingDir, true);

    return job;
}

From source file:com.linkedin.whiteelephant.mapreduce.MyAvroMultipleOutputs.java

License:Apache License

/**
 * Returns the task attempt context for the given named output, building and caching it in
 * {@code taskContexts} on first use.
 *
 * @param nameOutput the named output whose context is requested
 * @return the cached or newly created context for {@code nameOutput}
 * @throws IOException if the helper job cannot be created
 */
private TaskAttemptContext getContext(String nameOutput) throws IOException {
    TaskAttemptContext namedContext = taskContexts.get(nameOutput);

    if (namedContext == null) {
        // Creating a throw-away Job lets the record writer be instantiated through the
        // job, which is what makes arbitrary output formats usable here.
        context.getConfiguration().set("avro.mo.config.namedOutput", nameOutput);
        Job helperJob = new Job(context.getConfiguration());
        helperJob.setOutputFormatClass(getNamedOutputFormatClass(context, nameOutput));

        Schema keySchema = keySchemas.get(nameOutput + "_KEYSCHEMA");
        Schema valSchema = valSchemas.get(nameOutput + "_VALSCHEMA");

        // With zero reduce tasks the schemas are registered as map output schemas,
        // otherwise as job output schemas. A missing schema is simply not registered.
        if (helperJob.getNumReduceTasks() == 0) {
            if (keySchema != null) {
                AvroJob.setMapOutputKeySchema(helperJob, keySchema);
            }
            if (valSchema != null) {
                AvroJob.setMapOutputValueSchema(helperJob, valSchema);
            }
        } else {
            if (keySchema != null) {
                AvroJob.setOutputKeySchema(helperJob, keySchema);
            }
            if (valSchema != null) {
                AvroJob.setOutputValueSchema(helperJob, valSchema);
            }
        }

        namedContext = new TaskAttemptContext(helperJob.getConfiguration(), context.getTaskAttemptID());
        taskContexts.put(nameOutput, namedContext);
    }

    return namedContext;
}

From source file:com.luogh.learning.lab.hbase.IndexBuilderTest.java

License:Apache License

/**
 * Job configuration: builds the map-only index job that scans a table and writes through
 * {@code MultiTableOutputFormat}.
 * <p>
 * Example arguments:
 * {@code perf_test_schema:perf_test_table_normal_with_partition_key w_cf value}
 *
 * @param conf configuration that receives the table name, column family and field names
 * @param args {@code args[0]} is the table name, {@code args[1]} the column family; every
 *             remaining entry is stored under {@code index.fields}
 * @return the configured job (not submitted)
 * @throws IOException if the job cannot be created or initialized
 * @throws IllegalArgumentException if fewer than two arguments are supplied
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException or
    // NegativeArraySizeException further down.
    if (args == null || args.length < 2) {
        throw new IllegalArgumentException("Expected <table name> <column family> [field ...] but got "
                + (args == null ? 0 : args.length) + " argument(s)");
    }

    String tableName = args[0];
    String columnFamily = args[1];
    System.out.println("****" + tableName);
    conf.set(TableInputFormat.INPUT_TABLE, tableName);
    conf.set("index.tablename", tableName);
    conf.set("index.familyname", columnFamily);

    // Everything after the first two arguments is a field name.
    String[] fields = new String[args.length - 2];
    System.arraycopy(args, 2, fields, 0, fields.length);
    conf.setStrings("index.fields", fields);

    Job job = Job.getInstance(conf);
    TableMapReduceUtil.initTableMapperJob(Lists.newArrayList(new Scan()), Map.class,
            ImmutableBytesWritable.class, Put.class, job);
    job.setJarByClass(IndexBuilderTest.class);
    job.setMapperClass(Map.class);
    // Map-only job.
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TableInputFormat.class);
    job.setOutputFormatClass(MultiTableOutputFormat.class);
    return job;
}

From source file:com.mapr.db.utils.ImportCSV_MR.java

License:Apache License

/**
 * Imports a delimited text file into a MapR-DB JSON table with a map-only job, then
 * terminates the JVM with the job's status (0 on success, 1 on failure).
 *
 * @param args exactly four values: JSON table path, text file path, text file delimiter
 *             and schema file path; any other count prints the usage and exits with -1
 * @return 0 if the job succeeded, 1 otherwise (only observable if {@code System.exit} returns)
 * @throws Exception if reading the schema or running the job fails
 */
@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        System.out.println("MapR-DB JSON Tables - Import CSV" + "\nUsage:\n"
                + "\tParam 1: JSON Table Path (MapR-FS)\n" + "\tParam 2: Text File Path (Local-FS)\n"
                + "\tParam 3: Text File Delimiter (Local-FS)\n" + "\tParam 4: Schema File Path (Local-FS)\n");

        System.exit(-1);
    }

    outputTable = args[0].trim();
    inputDir = args[1].trim();
    delimiter = args[2].trim();
    schemaFile = args[3].trim();

    BasicConfigurator.configure();
    Logger.getRootLogger().setLevel(Level.ERROR);

    ImportCSV_MR imp = new ImportCSV_MR();

    imp.readSchema(schemaFile);
    imp.printSchema();

    Job job = Job.getInstance(conf, "ImportCSV_MR");
    job.setJarByClass(ImportCSV_MR.class);

    job.setMapperClass(MyMapper.class);

    // From here on work on the job's own configuration.
    conf = job.getConfiguration();
    // Append the JSON document serialization to the configured serializations.
    conf.setStrings("io.serializations",
            new String[] { conf.get("io.serializations"), JSONDocumentSerialization.class.getName() });

    // Publish the parsed schema and import settings through the job configuration.
    conf.set("countColumnsInSchema", String.valueOf(countColumnsInSchema));

    conf.set("delimiter", delimiter);

    conf.set("tablePath", outputTable);

    String valueTypes[] = valueTypesInSchema.toArray(new String[valueTypesInSchema.size()]);
    conf.setStrings("valueTypesInSchema", valueTypes);

    String columnNames[] = columnNamesInSchema.toArray(new String[columnNamesInSchema.size()]);
    conf.setStrings("columnNamesInSchema", columnNames);

    // Input format class and input path
    FileInputFormat.addInputPath(job, new Path(inputDir));
    job.setInputFormatClass(TextInputFormat.class);

    // Mapper output record key and value class
    job.setMapOutputKeyClass(ByteBufWritableComparable.class);
    job.setMapOutputValueClass(DBDocumentImpl.class);

    // Output format class and target table
    conf.set("maprdb.mapred.outputtable", outputTable);
    job.setOutputFormatClass(TableOutputFormat.class);

    // Map-only job: no reducers
    job.setNumReduceTasks(0);

    // Wait for the job exactly once and reuse that result for the exit status.
    boolean isJobSuccessful = job.waitForCompletion(true);
    int status = isJobSuccessful ? 0 : 1;
    System.exit(status);
    return status;
}