Usage examples for org.apache.hadoop.mapreduce.JobContext#getConfiguration()
public Configuration getConfiguration();
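Before the collected examples, here is a minimal sketch of the pattern they all share: a custom InputFormat (or committer) pulls job-level settings from the Configuration returned by JobContext.getConfiguration(). The class name MyInputFormat and the property key my.custom.min.records are hypothetical, used only to illustrate the call.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

public abstract class MyInputFormat<K, V> extends InputFormat<K, V> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // The JobContext carries the job's Configuration; read tuning values from it here.
        Configuration conf = context.getConfiguration();
        long minRecords = conf.getLong("my.custom.min.records", 1000L); // hypothetical key
        List<InputSplit> splits = new ArrayList<InputSplit>();
        // ... build splits, e.g. one per input file, using minRecords ...
        return splits;
    }
}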
From source file:com.netflix.bdp.s3.S3MultipartOutputCommitter.java
License:Apache License
/**
 * Returns the {@link ConflictResolution} mode for this commit.
 *
 * @param context the JobContext for this commit
 * @return the ConflictResolution mode
 */
protected final ConflictResolution getMode(JobContext context) {
    if (mode == null) {
        this.mode = ConflictResolution.valueOf(context.getConfiguration()
                .get(S3Committer.CONFLICT_MODE, "fail").toUpperCase(Locale.ENGLISH));
    }
    return mode;
}
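Because getMode() reads S3Committer.CONFLICT_MODE from the job configuration with a default of "fail", a driver can choose the resolution mode before submission. A minimal sketch, assuming the S3Committer constant lives in the same com.netflix.bdp.s3 package and is accessible to the driver:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

import com.netflix.bdp.s3.S3Committer;

public class ConflictModeDriverSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "conflict-mode-example");
        // getMode() upper-cases this value, so "replace" selects ConflictResolution.REPLACE;
        // leaving the key unset falls back to the "fail" default shown above.
        job.getConfiguration().set(S3Committer.CONFLICT_MODE, "replace");
    }
}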
From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java
License:Apache License
@Override
public void commitJob(JobContext context) throws IOException {
    List<S3Util.PendingUpload> pending = getPendingUploads(context);
    FileSystem s3 = getOutputPath(context).getFileSystem(context.getConfiguration());

    Set<Path> partitions = Sets.newLinkedHashSet();
    LOG.info("The partitions are: " + partitions);
    for (S3Util.PendingUpload commit : pending) {
        Path filePath = new Path("s3://" + commit.getBucketName() + "/" + commit.getKey());
        partitions.add(filePath.getParent());
    }

    // enforce conflict resolution
    boolean threw = true;
    try {
        switch (getMode(context)) {
        case FAIL:
            // FAIL checking is done on the task side, so this does nothing
            break;
        case APPEND:
            // no check is needed because the output may exist for appending
            break;
        case REPLACE:
            for (Path partitionPath : partitions) {
                if (s3.exists(partitionPath)) {
                    LOG.info("Removing partition path to be replaced: " + partitionPath);
                    if (!s3.delete(partitionPath, true /* recursive */)) {
                        throw new IOException("Failed to delete existing "
                                + "partition directory for replace:" + partitionPath);
                    }
                }
            }
            break;
        default:
            throw new RuntimeException("Unknown conflict resolution mode: " + getMode(context));
        }
        threw = false;
    } catch (IOException e) {
        throw new IOException("Failed to enforce conflict resolution", e);
    } finally {
        if (threw) {
            abortJobInternal(context, pending, threw);
        }
    }

    commitJobInternal(context, pending);
}
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
private static Set<String> runTasks(JobContext job, int numTasks, int numFiles) throws IOException {
    Set<String> uploads = Sets.newHashSet();

    for (int taskId = 0; taskId < numTasks; taskId += 1) {
        TaskAttemptID attemptID = new TaskAttemptID(new TaskID(JOB_ID, TaskType.REDUCE, taskId),
                (taskId * 37) % numTasks);
        TaskAttemptContext attempt = new TaskAttemptContextImpl(new Configuration(job.getConfiguration()),
                attemptID);
        MockedS3Committer taskCommitter = new MockedS3Committer(S3_OUTPUT_PATH, attempt);
        commitTask(taskCommitter, attempt, numFiles);
        uploads.addAll(taskCommitter.results.getUploads());
    }

    return uploads;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * Get the list of input {@link Path}s for the map-reduce job.
 *
 * @param context The job
 * @return the list of input {@link Path}s for the map-reduce job.
 */
public static Path[] getInputPaths(JobContext context) {
    String dirs = context.getConfiguration().get("mapred.input.dir", "");
    String[] list = StringUtils.split(dirs);
    Path[] result = new Path[list.length];
    for (int i = 0; i < list.length; i++) {
        result[i] = new Path(StringUtils.unEscapeString(list[i]));
    }
    return result;
}
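getInputPaths() reads the comma-separated "mapred.input.dir" property, so a driver only has to populate that key on the job configuration (directly, or via the usual FileInputFormat helpers). A minimal sketch with hypothetical paths:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class SmileInputPathsSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "smile-example");
        // getInputPaths() splits this comma-separated list back into Path objects.
        job.getConfiguration().set("mapred.input.dir", "/events/2023-01-01,/events/2023-01-02");
    }
}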
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
/**
 * List input directories.
 *
 * @param job the job to list input paths for
 * @return array of FileStatus objects
 * @throws IOException if zero items.
 */
protected List<FileStatus> listStatus(JobContext job) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    Path[] dirs = getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }

    // Get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        final SmilePathFilter filter = new SmilePathFilter();
        FileStatus[] matches = fs.globStatus(p, filter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDir()) {
                    Collections.addAll(result, fs.listStatus(globStat.getPath(), filter));
                } else {
                    result.add(globStat);
                }
            }
        }
    }

    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
From source file:com.ning.metrics.serialization.hadoop.SmileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException, InterruptedException {
    final List<InputSplit> splits = new ArrayList<InputSplit>();

    final List<FileStatus> files = listStatus(jobContext);
    for (FileStatus file : files) {
        final Path path = file.getPath();
        final FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        final BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, file.getLen());
        final List<String> blkHosts = new ArrayList<String>();
        for (final BlockLocation location : blkLocations) {
            blkHosts.addAll(Arrays.asList(location.getHosts()));
        }

        // TODO Split files =)
        final String[] hosts = blkHosts.toArray(new String[0]);
        splits.add(new FileSplit(path, 0, file.getLen(), hosts));
    }

    return splits;
}
From source file:com.phantom.hadoop.examples.terasort.TeraInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
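Calling writePartitionFile from a driver needs little more than a Job whose configuration points at the input: the method takes the reduce count from job.getNumReduceTasks() and the sampling parameters from job.getConfiguration(). A minimal sketch, assuming this TeraInputFormat extends FileInputFormat as in stock Hadoop and that the input and output paths below are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.phantom.hadoop.examples.terasort.TeraInputFormat;

public class TeraPartitionSketch {
    public static void main(String[] args) throws Throwable {
        Job job = Job.getInstance(new Configuration(), "terasort-partitions");
        FileInputFormat.addInputPath(job, new Path("/teragen-output")); // placeholder input
        job.setNumReduceTasks(20); // writePartitionFile derives the partition count from this
        // Samples the input and writes the partition boundary keys to the given file.
        TeraInputFormat.writePartitionFile(job, new Path("/tmp/_partition.lst")); // placeholder output
    }
}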
From source file:com.pivotal.hawq.mapreduce.ao.HAWQAOInputFormat.java
License:Apache License
/**
 * Generate the list of files and make them into FileSplits.
 *
 * @param job the job context
 * @throws IOException
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (int i = 0; i < fileStatuses.length; ++i) {
        HAWQAOFileStatus aofilestatus = fileStatuses[i];
        String pathStr = aofilestatus.getFilePath();
        long fileLength = aofilestatus.getFileLength();
        if (fileLength == 0)
            continue;

        boolean checksum = aofilestatus.getChecksum();
        String compressType = aofilestatus.getCompressType();
        int blocksize = aofilestatus.getBlockSize();
        Path path = new Path(pathStr);

        if (fileLength != 0) {
            FileSystem fs = path.getFileSystem(job.getConfiguration());
            BlockLocation[] blkLocations = fs.getFileBlockLocations(fs.getFileStatus(path), 0, fileLength);
            // not splitable
            splits.add(new HAWQAOSplit(path, 0, fileLength, blkLocations[0].getHosts(), checksum,
                    compressType, blocksize));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new HAWQAOSplit(path, 0, fileLength, new String[0], checksum, compressType,
                    blocksize));
        }
    }
    job.getConfiguration().setLong(NUM_INPUT_FILES, splits.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file:com.pivotal.hawq.mapreduce.HAWQInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    HAWQTableFormat tableFormat = getTableFormat(job.getConfiguration());
    switch (tableFormat) {
    case AO:
        return aoInputFormat.getSplits(job);
    case Parquet:
        return parquetInputFormat.getSplits(job);
    default:
        throw new AssertionError("invalid table format: " + tableFormat);
    }
}
From source file:com.pivotal.hawq.mapreduce.parquet.HAWQParquetInputFormat.java
License:Apache License
@Override
protected List<FileStatus> listStatus(JobContext jobContext) throws IOException {
    List<FileStatus> result = Lists.newArrayList();
    for (HAWQFileStatus hawqFileStatus : hawqFileStatuses) {
        if (hawqFileStatus.getFileLength() == 0)
            continue; // skip empty file

        Path path = new Path(hawqFileStatus.getFilePath());
        FileSystem fs = path.getFileSystem(jobContext.getConfiguration());
        FileStatus dfsStat = fs.getFileStatus(path);

        // rewrite file length because HAWQ records the logicalEOF of file, which may
        // be smaller than the file's actual EOF
        FileStatus hawqStat = new FileStatus(hawqFileStatus.getFileLength(), // rewrite to logicalEOF
                dfsStat.isDirectory(), dfsStat.getReplication(), dfsStat.getBlockSize(),
                dfsStat.getModificationTime(), dfsStat.getAccessTime(), dfsStat.getPermission(),
                dfsStat.getOwner(), dfsStat.getGroup(), dfsStat.getPath());
        result.add(hawqStat);
    }
    return result;
}