Usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
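Before the collected examples, here is a minimal sketch of the pattern they all share: a custom InputFormat (or committer) pulls job-level settings from the Configuration returned by JobContext.getConfiguration(). The class name MyInputFormat and the property key my.custom.min.records are hypothetical, used only to illustrate the call.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

public abstract class MyInputFormat<K, V> extends InputFormat<K, V> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // The JobContext carries the job's Configuration; read tuning values from it here.
        Configuration conf = context.getConfiguration();
        long minRecords = conf.getLong("my.custom.min.records", 1000L); // hypothetical key
        List<InputSplit> splits = new ArrayList<InputSplit>();
        // ... build splits, e.g. one per input file, using minRecords ...
        return splits;
    }
}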
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
/**
 * Returns the {@link ConflictResolution} mode for this commit.
 *
 * @param context the JobContext for this commit
 * @return the ConflictResolution mode
 */
protected final ConflictResolution getMode(JobContext context) {
    if (mode == null) {
        this.mode = ConflictResolution.valueOf(context.getConfiguration()
                .get(S3Committer.CONFLICT_MODE, "fail").toUpperCase(Locale.ENGLISH));
    }
    return mode;
}
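Because getMode() reads S3Committer.CONFLICT_MODE from the job configuration with a default of "fail", a driver can choose the resolution mode before submission. A minimal sketch, assuming the S3Committer constant lives in the same com.netflix.bdp.s3 package and is accessible to the driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.netflix.bdp.s3.S3Committer;

public class ConflictModeDriverSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "conflict-mode-example");
        // getMode() upper-cases this value, so "replace" selects ConflictResolution.REPLACE;
        // leaving the key unset falls back to the "fail" default shown above.
        job.getConfiguration().set(S3Committer.CONFLICT_MODE, "replace");
    }
}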
From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java
License:Apache License
@Override
public void commitJob(JobContext context) throws IOException {
    List<S3Util.PendingUpload> pending = getPendingUploads(context);
    FileSystem s3 = getOutputPath(context).getFileSystem(context.getConfiguration());

    Set<Path> partitions = Sets.newLinkedHashSet();
    LOG.info("The partitions are: " + partitions);
    for (S3Util.PendingUpload commit : pending) {
        Path filePath = new Path("s3://" + commit.getBucketName() + "/" + commit.getKey());
        partitions.add(filePath.getParent());
    }

    // enforce conflict resolution
    boolean threw = true;
    try {
        switch (getMode(context)) {
        case FAIL:
            // FAIL checking is done on the task side, so this does nothing
            break;
        case APPEND:
            // no check is needed because the output may exist for appending
            break;
        case REPLACE:
            for (Path partitionPath : partitions) {
                if (s3.exists(partitionPath)) {
                    LOG.info("Removing partition path to be replaced: " + partitionPath);
                    if (!s3.delete(partitionPath, true /* recursive */)) {
                        throw new IOException("Failed to delete existing "
                                + "partition directory for replace:" + partitionPath);
                    }
                }
            }
            break;
        default:
            throw new RuntimeException("Unknown conflict resolution mode: " + getMode(context));
        }
        threw = false;
    } catch (IOException e) {
        throw new IOException("Failed to enforce conflict resolution", e);
    } finally {
        if (threw) {
            abortJobInternal(context, pending, threw);
        }
    }

    commitJobInternal(context, pending);
}
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
private static Set<String> runTasks(JobContext job, int numTasks, int numFiles) throws IOException {
    Set<String> uploads = Sets.newHashSet();

    for (int taskId = 0; taskId < numTasks; taskId += 1) {
        TaskAttemptID attemptID = new TaskAttemptID(new TaskID(JOB_ID, TaskType.REDUCE, taskId),
                (taskId * 37) % numTasks);
        TaskAttemptContext attempt = new TaskAttemptContextImpl(new Configuration(job.getConfiguration()),
                attemptID);
        MockedS3Committer taskCommitter = new MockedS3Committer(S3_OUTPUT_PATH, attempt);
        commitTask(taskCommitter, attempt, numFiles);
        uploads.addAll(taskCommitter.results.getUploads());
    }

    return uploads;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param context The job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get("mapred.input.dir", "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}
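getInputPaths() reads the comma-separated "mapred.input.dir" property, so a driver only has to populate that key on the job configuration (directly, or via the usual FileInputFormat helpers). A minimal sketch with hypothetical paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SmileInputPathsSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "smile-example");
        // getInputPaths() splits this comma-separated list back into Path objects.
        job.getConfiguration().set("mapred.input.dir", "/events/2023-01-01,/events/2023-01-02");
    }
}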
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();

    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }

        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }

    return splits;
}
From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
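Calling writePartitionFile from a driver needs little more than a Job whose configuration points at the input: the method takes the reduce count from job.getNumReduceTasks() and the sampling parameters from job.getConfiguration(). A minimal sketch, assuming this TeraInputFormat extends FileInputFormat as in stock Hadoop and that the input and output paths below are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.phantom.hadoop.examples.terasort.TeraInputFormat;

public class TeraPartitionSketch {
    public static void main(String[] args) throws Throwable {
        Job job = Job.getInstance(new Configuration(), "terasort-partitions");
        FileInputFormat.addInputPath(job, new Path("/teragen-output")); // placeholder input
        job.setNumReduceTasks(20); // writePartitionFile derives the partition count from this
        // Samples the input and writes the partition boundary keys to the given file.
        TeraInputFormat.writePartitionFile(job, new Path("/tmp/_partition.lst")); // placeholder output
    }
}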
From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);

        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum,
                    compressType, blocksize));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType,
                    blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file:com.pivotal.hawq.mapreduce.HAWQInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    HAWQTableFormat tableFormat = getTableFormat(job.getConfiguration());
    switch (tableFormat) {
    case AO:
        return aoInputFormat.getSplits(job);
    case Parquet:
        return parquetInputFormat.getSplits(job);
    default:
        throw new AssertionError("invalid table format: " + tableFormat);
    }
}
From source file:com.pivotal.hawq.mapreduce.parquet.HAWQParquetInputFormat.java
License:Apache License
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
    List<FileStatus> result = Lists.newArrayList();
    for (HAWQFileStatus hawqFileStatus : hawqFileStatuses) {
        if (hawqFileStatus.getFileLength() == 0)
            continue; // skip empty file

        Path path = new Path(hawqFileStatus.getFilePath());
        FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        FileStatus dfsStat = fs.getFileStatus(path);

        // rewrite file length because HAWQ records the logicalEOF of file, which may
        // be smaller than the file's actual EOF
        FileStatus hawqStat = new FileStatus(hawqFileStatus.getFileLength(), // rewrite to logicalEOF
                dfsStat.isDirectory(), dfsStat.getReplication(), dfsStat.getBlockSize(),
                dfsStat.getModificationTime(), dfsStat.getAccessTime(), dfsStat.getPermission(),
                dfsStat.getOwner(), dfsStat.getGroup(), dfsStat.getPath());
        result.add(hawqStat);
    }
    return result;
}