Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
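
The method is most often called from code that only receives a JobContext or TaskAttemptContext, such as an InputFormat, OutputFormat, or OutputCommitter, as the examples below show. A minimal sketch of the pattern; the property name example.min.split.bytes is hypothetical and used only for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.JobContext;

public class GetConfigurationExample {

    // Hypothetical property name, for illustration only.
    private static final String MIN_SPLIT_BYTES = "example.min.split.bytes";

    // JobContext is all that methods like InputFormat#getSplits receive, so
    // getConfiguration() is the standard way to reach job-level settings there.
    static long minSplitBytes(JobContext context) {
        Configuration conf = context.getConfiguration();
        return conf.getLong(MIN_SPLIT_BYTES, 1L << 20); // default: 1 MiB
    }
}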

Usage

From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java

License:Apache License

/**
 * Returns the {@link ConflictResolution} mode for this commit.
 *
 * @param context the JobContext for this commit
 * @return the ConflictResolution mode
 */
protected final ConflictResolution getMode(JobContext context) {
    if (mode == null) {
        this.mode = ConflictResolution.valueOf(
                context.getConfiguration().get(S3Committer.CONFLICT_MODE, "fail").toUpperCase(Locale.ENGLISH));
    }
    return mode;
}
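
The key read here is typically set on the driver side through the job's own Configuration, which is the same object the committer later sees via context.getConfiguration(). A minimal driver-side sketch, assuming the S3Committer constant from the snippet above is accessible (the import guesses the same com.netflix.bdp.s3 package); getMode() upper-cases the value before ConflictResolution.valueOf(), and "fail" is the default when the key is unset:

import com.netflix.bdp.s3.S3Committer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class ConflictModeDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "s3-committer-example");
        // Valid values mirror the ConflictResolution constants: fail, append, replace.
        job.getConfiguration().set(S3Committer.CONFLICT_MODE, "replace");
        // ... configure input, output and the committer, then submit the job.
    }
}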

From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java

License:Apache License

@Override
public void commitJob(JobContext context) throws IOException {
    List<S3Util.PendingUpload> pending = getPendingUploads(context);

    FileSystem s3 = getOutputPath(context).getFileSystem(context.getConfiguration());
    Set<Path> partitions = Sets.newLinkedHashSet();
    for (S3Util.PendingUpload commit : pending) {
        Path filePath = new Path("s3://" + commit.getBucketName() + "/" + commit.getKey());
        partitions.add(filePath.getParent());
    }
    LOG.info("The partitions are: " + partitions);

    // enforce conflict resolution
    boolean threw = true;
    try {
        switch (getMode(context)) {
        case FAIL:
            // FAIL checking is done on the task side, so this does nothing
            break;
        case APPEND:
            // no check is needed because the output may exist for appending
            break;
        case REPLACE:
            for (Path partitionPath : partitions) {
                if (s3.exists(partitionPath)) {
                    LOG.info("Removing partition path to be replaced: " + partitionPath);
                    if (!s3.delete(partitionPath, true /* recursive */)) {
                        throw new IOException("Failed to delete existing " + "partition directory for replace:"
                                + partitionPath);
                    }
                }
            }
            break;
        default:
            throw new RuntimeException("Unknown conflict resolution mode: " + getMode(context));
        }

        threw = false;

    } catch (IOException e) {
        throw new IOException("Failed to enforce conflict resolution", e);

    } finally {
        if (threw) {
            abortJobInternal(context, pending, threw);
        }
    }

    commitJobInternal(context, pending);
}

From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java

License:Apache License

private static Set<String> runTasks(JobContext job, int numTasks, int numFiles) throws IOException {
    Set<String> uploads = Sets.newHashSet();

    for (int taskId = 0; taskId < numTasks; taskId += 1) {
        TaskAttemptID attemptID = new TaskAttemptID(new TaskID(JOB_ID, TaskType.REDUCE, taskId),
                (taskId * 37) % numTasks);
        TaskAttemptContext attempt = new TaskAttemptContextImpl(new Configuration(job.getConfiguration()),
                attemptID);
        MockedS3Committer taskCommitter = new MockedS3Committer(S3_OUTPUT_PATH, attempt);
        commitTask(taskCommitter, attempt, numFiles);
        uploads.addAll(taskCommitter.results.getUploads());
    }

    return uploads;
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param context The job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get("mapred.input.dir", "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}
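
A driver would populate the same property this method parses. A minimal sketch that sets the legacy mapred.input.dir key directly, in the comma-separated, escaped form that getInputPaths() splits and un-escapes; in practice FileInputFormat.setInputPaths would normally manage this property:

import com.ning.metrics.serialization.hadoop.SmileInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.util.StringUtils;

public class SmileJobDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // getInputPaths() splits this value on commas and un-escapes each entry,
        // so paths containing commas must be escaped first.
        conf.set("mapred.input.dir", StringUtils.escapeString("/events/2013/01/01")
                + "," + StringUtils.escapeString("/events/2013/01/02"));
        Job job = Job.getInstance(conf, "smile-example");
        job.setInputFormatClass(SmileInputFormat.class);
        // ... set mapper, reducer and output path, then submit.
    }
}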

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return list of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }

    return result;
}

From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();
    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }

        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }

    return splits;
}

From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Interrupted while sampling; let this reader thread exit quietly.
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
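
Both sampling knobs travel through the configuration that writePartitionFile() reads back via job.getConfiguration(): SAMPLE_SIZE bounds the total number of sampled keys and NUM_PARTITIONS the number of splits sampled, while the partition count itself comes from the job's reducer count. A driver-side sketch, assuming the two constants on TeraInputFormat are visible to the caller:

import com.phantom.hadoop.examples.terasort.TeraInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class TeraSortSamplingDriver {
    public static void main(String[] args) throws Throwable {
        Job job = Job.getInstance(new Configuration(), "terasort-example");
        job.setNumReduceTasks(8); // 8 partitions -> 7 cut keys in the partition file
        // Defaults inside writePartitionFile(): 100,000 sampled keys across 10 splits.
        job.getConfiguration().setLong(TeraInputFormat.SAMPLE_SIZE, 1000000L);
        job.getConfiguration().setInt(TeraInputFormat.NUM_PARTITIONS, 20);
        TeraInputFormat.writePartitionFile(job, new Path("/tmp/terasort-partitions"));
    }
}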

From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java

License:Apache License

/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job
 *            the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);
        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum, compressType,
                    blocksize));
        } else {
            // Unreachable in practice: zero-length files are skipped above.
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType, blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}

From source file:com.pivotal.hawq.mapreduce.HAWQInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    HAWQTableFormat tableFormat = getTableFormat(job.getConfiguration());

    switch (tableFormat) {
    case AO:
        return aoInputFormat.getSplits(job);
    case Parquet:
        return parquetInputFormat.getSplits(job);
    default:
        throw new AssertionError("invalid table format: " + tableFormat);
    }
}

From source file:com.pivotal.hawq.mapreduce.parquet.HAWQParquetInputFormat.java

License:Apache License

@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
    List<FileStatus> result = Lists.newArrayList();
    for (HAWQFileStatus hawqFileStatus : hawqFileStatuses) {
        if (hawqFileStatus.getFileLength() == 0)
            continue; // skip empty file

        Path path = new Path(hawqFileStatus.getFilePath());
        FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        FileStatus dfsStat = fs.getFileStatus(path);
        // rewrite file length because HAWQ records the logicalEOF of file, which may
        // be smaller than the file's actual EOF
        FileStatus hawqStat = new FileStatus(hawqFileStatus.getFileLength(), // rewrite to logicalEOF
                dfsStat.isDirectory(), dfsStat.getReplication(), dfsStat.getBlockSize(),
                dfsStat.getModificationTime(), dfsStat.getAccessTime(), dfsStat.getPermission(),
                dfsStat.getOwner(), dfsStat.getGroup(), dfsStat.getPath());
        result.add(hawqStat);
    }

    return result;
}