Usage examples for org.apache.hadoop.mapreduce.JobContext.getConfiguration()
// Signature of the method the examples below call: it takes no arguments and returns a Configuration.
public Configuration getConfiguration();
From source file:com.cloudera.ByteBufferInputFormat.java
License:Apache License
/**
 * Decides whether the given file may be split.
 *
 * A codec factory is built from the job configuration and asked for the
 * codec matching {@code file}; the file is reported as splittable only when
 * the factory resolves no codec for it.
 *
 * @param context job context supplying the configuration
 * @param file the file being considered
 * @return {@code true} when no codec was resolved for the file
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
  CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
  CompressionCodec matchedCodec = codecFactory.getCodec(file);
  if (matchedCodec != null) {
    return false;
  }
  return true;
}
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException { List<InputSplit> splits = Lists.newArrayList(); Configuration conf = job.getConfiguration(); Job jobCopy = new Job(conf); Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs .getFormatNodeMap(jobCopy);//from w w w. j a va 2 s.c om // First, build a map of InputFormats to Paths for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) { Class<? extends InputFormat> formatClass = entry.getKey(); InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf); for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) { Integer nodeIndex = nodeEntry.getKey(); List<Path> paths = nodeEntry.getValue(); FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()])); // Get splits for each input path and tag with InputFormat // and Mapper types by wrapping in a TaggedInputSplit. List<InputSplit> pathSplits = format.getSplits(jobCopy); for (InputSplit pathSplit : pathSplits) { splits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf)); } } } return splits; }
From source file:com.cloudera.crunch.impl.mr.run.CrunchInputs.java
License:Apache License
/**
 * Decodes the multi-input description stored in the job configuration under
 * {@code RuntimeParameters.MULTI_INPUTS}.
 *
 * The value is split into records on {@code RECORD_SEP}; each record is split
 * into fields with {@code SPLITTER}, where field 0 is an input format class
 * name, field 1 an integer node index and field 2 a path.
 *
 * @param job the job whose configuration holds the description
 * @return a map from input format class to node index to the paths listed for
 *         that pair; empty when the property is missing or empty
 * @throws RuntimeException wrapping {@code ClassNotFoundException} when a
 *         listed input format class cannot be loaded
 */
public static Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
  Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
  Configuration conf = job.getConfiguration();

  // A job without any registered inputs has no such property; splitting a null
  // value would throw, and an empty value would be parsed as a class named "".
  String multiInputs = conf.get(RuntimeParameters.MULTI_INPUTS);
  if (multiInputs == null || multiInputs.isEmpty()) {
    return formatNodeMap;
  }

  for (String input : Splitter.on(RECORD_SEP).split(multiInputs)) {
    List<String> fields = ImmutableList.copyOf(SPLITTER.split(input));

    Class<? extends InputFormat> inputFormatClass;
    try {
      inputFormatClass = (Class<? extends InputFormat>) conf.getClassByName(fields.get(0));
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }

    // Look each level up once and create it on first sight.
    Map<Integer, List<Path>> pathsByNode = formatNodeMap.get(inputFormatClass);
    if (pathsByNode == null) {
      pathsByNode = Maps.newHashMap();
      formatNodeMap.put(inputFormatClass, pathsByNode);
    }

    Integer nodeIndex = Integer.valueOf(fields.get(1));
    List<Path> paths = pathsByNode.get(nodeIndex);
    if (paths == null) {
      paths = Lists.newLinkedList();
      pathsByNode.put(nodeIndex, paths);
    }
    paths.add(new Path(fields.get(2)));
  }
  return formatNodeMap;
}
From source file:com.cloudera.recordservice.examples.terasort.RecordServiceTeraInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException { PlanUtil.SplitsInfo info = PlanUtil.getSplits(context.getConfiguration(), context.getCredentials()); if (info.schema.getNumColumns() != 1 || info.schema.getColumnInfo(0).type.typeId != Schema.Type.STRING) { throw new IOException("Invalid data. Expecting schema to be a single STRING."); }//from w ww.j a v a 2 s . c o m return info.splits; }
From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java
License:Apache License
/**
 * Reads the {@code NUM_ROWS} setting from the job configuration.
 *
 * @param job the job to query
 * @return the configured value, with 0 passed as the default
 */
static long getNumberOfRows(JobContext job) {
  final long configuredRows = job.getConfiguration().getLong(NUM_ROWS, 0);
  return configuredRows;
}
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
/** * Use the input splits to take samples of the input and generate sample * keys. By default reads 100,000 keys from 10 locations in the input, sorts * them and picks N-1 keys to generate N equally sized partitions. * @param job the job to sample/*from w ww . j a va2 s. c om*/ * @param partFile where to write the output file to * @throws Throwable if something goes wrong */ public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable { long t1 = System.currentTimeMillis(); Configuration conf = job.getConfiguration(); final TeraInputFormat inFormat = new TeraInputFormat(); final TextSampler sampler = new TextSampler(); int partitions = job.getNumReduceTasks(); long sampleSize = conf.getLong(SAMPLE_SIZE, 100000); final List<InputSplit> splits = inFormat.getSplits(job); long t2 = System.currentTimeMillis(); System.out.println("Computing input splits took " + (t2 - t1) + "ms"); int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size()); System.out.println("Sampling " + samples + " splits of " + splits.size()); final long recordsPerSample = sampleSize / samples; final int sampleStep = splits.size() / samples; Thread[] samplerReader = new Thread[samples]; SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group"); // take N samples from different parts of the input for (int i = 0; i < samples; ++i) { final int idx = i; samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) { { setDaemon(true); } @Override public void run() { long records = 0; try { TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID()); RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx), context); reader.initialize(splits.get(sampleStep * idx), context); while (reader.nextKeyValue()) { sampler.addKey(new Text(reader.getCurrentKey())); records += 1; if (recordsPerSample <= records) { break; } } } catch (IOException ie) { 
System.err.println( "Got an exception while reading splits " + StringUtils.stringifyException(ie)); throw new RuntimeException(ie); } catch (InterruptedException e) { } } }; samplerReader[i].start(); } FileSystem outFs = partFile.getFileSystem(conf); DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10, outFs.getDefaultBlockSize(partFile)); for (int i = 0; i < samples; i++) { try { samplerReader[i].join(); if (threadGroup.getThrowable() != null) { throw threadGroup.getThrowable(); } } catch (InterruptedException e) { } } for (Text split : sampler.createPartitions(partitions)) { split.write(writer); } writer.close(); long t3 = System.currentTimeMillis(); System.out.println("Computing parititions took " + (t3 - t2) + "ms"); }
From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License:Apache License
@Override public List<InputSplit> getSplits(JobContext job) throws IOException { if (job == lastContext) { return lastResult; }// w ww . ja v a2 s .c o m long t1, t2, t3; t1 = System.currentTimeMillis(); lastContext = job; lastResult = super.getSplits(job); t2 = System.currentTimeMillis(); System.out.println("Spent " + (t2 - t1) + "ms computing base-splits."); if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) { TeraScheduler scheduler = new TeraScheduler(lastResult.toArray(new FileSplit[0]), job.getConfiguration()); lastResult = scheduler.getNewFileSplits(); t3 = System.currentTimeMillis(); System.out.println("Spent " + (t3 - t2) + "ms computing TeraScheduler splits."); } return lastResult; }
From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java
License:Apache License
/**
 * Records in the job configuration whether a final sync is required before
 * the stream is closed.
 *
 * @param job the job whose configuration is updated
 * @param newValue the value stored under {@code FINAL_SYNC_ATTRIBUTE}
 */
static void setFinalSync(JobContext job, boolean newValue) {
  job.getConfiguration()
      .setBoolean(FINAL_SYNC_ATTRIBUTE, newValue);
}
From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java
License:Apache License
/** * Does the user want a final sync at close? */// w w w . j ava2s. c o m public static boolean getFinalSync(JobContext job) { return job.getConfiguration().getBoolean(FINAL_SYNC_ATTRIBUTE, false); }
From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java
License:Apache License
@Override public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException { // Ensure that the output directory is set Path outDir = getOutputPath(job); if (outDir == null) { throw new InvalidJobConfException("Output directory not set in JobConf."); }// w w w . ja va 2 s .co m // get delegation token for outDir's file system TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration()); }