Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
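
A minimal sketch of the call from task-side code: a Mapper's Context extends JobContext, so values set on the job's Configuration at submission time can be read back in setup(). The property name "my.example.flag" is a hypothetical placeholder.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlagAwareMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private boolean flag;

    @Override
    protected void setup(Context context) {
        // Context extends JobContext, so getConfiguration() is available here.
        Configuration conf = context.getConfiguration();
        flag = conf.getBoolean("my.example.flag", false); // hypothetical property name
    }
}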

Usage

From source file: com.cloudera.ByteBufferInputFormat.java

License: Apache License

@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    return (null == codec);
}
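
If compressed-but-splittable inputs (for example bzip2) should still be split, the usual refinement, as in Hadoop's own TextInputFormat, also checks for SplittableCompressionCodec; a sketch:

@Override
protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
    if (null == codec) {
        return true; // uncompressed files can always be split
    }
    return codec instanceof SplittableCompressionCodec;
}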

From source file: com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    List<InputSplit> splits = Lists.newArrayList();
    Configuration conf = job.getConfiguration();
    Job jobCopy = new Job(conf);
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs
            .getFormatNodeMap(jobCopy);

    // For each InputFormat, compute splits for all of its input paths
    for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
        Class<? extends InputFormat> formatClass = entry.getKey();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);
        for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> paths = nodeEntry.getValue();
            FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));

            // Get splits for each input path and tag with InputFormat
            // and Mapper types by wrapping in a TaggedInputSplit.
            List<InputSplit> pathSplits = format.getSplits(jobCopy);
            for (InputSplit pathSplit : pathSplits) {
                splits.add(new CrunchInputSplit(pathSplit, formatClass, nodeIndex, conf));
            }
        }
    }
    return splits;
}

From source file: com.cloudera.crunch.impl.mr.run.CrunchInputs.java

License: Apache License

public static Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = Maps.newHashMap();
    Configuration conf = job.getConfiguration();
    for (String input : Splitter.on(RECORD_SEP).split(conf.get(RuntimeParameters.MULTI_INPUTS))) {
        List<String> fields = ImmutableList.copyOf(SPLITTER.split(input));
        Class<? extends InputFormat> inputFormatClass;
        try {
            inputFormatClass = (Class<? extends InputFormat>) conf.getClassByName(fields.get(0));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }
        if (!formatNodeMap.containsKey(inputFormatClass)) {
            formatNodeMap.put(inputFormatClass, Maps.<Integer, List<Path>>newHashMap());
        }
        Integer nodeIndex = Integer.valueOf(fields.get(1));
        if (!formatNodeMap.get(inputFormatClass).containsKey(nodeIndex)) {
            formatNodeMap.get(inputFormatClass).put(nodeIndex, Lists.<Path>newLinkedList());
        }
        formatNodeMap.get(inputFormatClass).get(nodeIndex).add(new Path(fields.get(2)));
    }
    return formatNodeMap;
}
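
For context, the entries parsed above are written by a companion setter that joins the InputFormat class name, node index, and path into one record under RuntimeParameters.MULTI_INPUTS. A hypothetical sketch for illustration only (FIELD_SEP is an assumed constant matching the SPLITTER used above):

// Hypothetical companion setter; it registers inputs in the same
// class/index/path shape that getFormatNodeMap parses.
public static void addInputPath(Job job, Path path, Class<? extends InputFormat> inputFormatClass,
        int nodeIndex) {
    Configuration conf = job.getConfiguration();
    String entry = inputFormatClass.getName() + FIELD_SEP + nodeIndex + FIELD_SEP + path;
    String existing = conf.get(RuntimeParameters.MULTI_INPUTS);
    // Append to any inputs registered earlier.
    conf.set(RuntimeParameters.MULTI_INPUTS, existing == null ? entry : existing + RECORD_SEP + entry);
}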

From source file: com.cloudera.recordservice.examples.terasort.RecordServiceTeraInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    PlanUtil.SplitsInfo info = PlanUtil.getSplits(context.getConfiguration(), context.getCredentials());
    if (info.schema.getNumColumns() != 1 || info.schema.getColumnInfo(0).type.typeId != Schema.Type.STRING) {
        throw new IOException("Invalid data. Expecting schema to be a single STRING.");
    }
    return info.splits;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraGen.java

License: Apache License

static long getNumberOfRows(JobContext job) {
    return job.getConfiguration().getLong(NUM_ROWS, 0);
}
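
TeraGen pairs this getter with a matching setter; a minimal sketch, assuming the same NUM_ROWS key:

static void setNumberOfRows(Job job, long numRows) {
    job.getConfiguration().setLong(NUM_ROWS, numRows);
}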

From source file: com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License: Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                    // Interrupted while sampling; let this reader thread exit quietly.
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
            // Interrupted while waiting for a sampler; move on to the next one.
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file: com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    if (job == lastContext) {
        return lastResult;
    }
    long t1, t2, t3;
    t1 = System.currentTimeMillis();
    lastContext = job;
    lastResult = super.getSplits(job);
    t2 = System.currentTimeMillis();
    System.out.println("Spent " + (t2 - t1) + "ms computing base-splits.");
    if (job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
        TeraScheduler scheduler = new TeraScheduler(lastResult.toArray(new FileSplit[0]),
                job.getConfiguration());
        lastResult = scheduler.getNewFileSplits();
        t3 = System.currentTimeMillis();
        System.out.println("Spent " + (t3 - t2) + "ms computing TeraScheduler splits.");
    }
    return lastResult;
}

From source file: com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License: Apache License

/**
 * Set the requirement for a final sync before the stream is closed.
 */
static void setFinalSync(JobContext job, boolean newValue) {
    job.getConfiguration().setBoolean(FINAL_SYNC_ATTRIBUTE, newValue);
}

From source file: com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License: Apache License

/**
 * Does the user want a final sync at close?
 */
public static boolean getFinalSync(JobContext job) {
    return job.getConfiguration().getBoolean(FINAL_SYNC_ATTRIBUTE, false);
}
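
Together, setFinalSync and getFinalSync stash a per-job flag in the Configuration. A record writer can then consult the flag when its stream is closed; a sketch, assuming an FSDataOutputStream field named out (TaskAttemptContext extends JobContext, so it can be passed to getFinalSync):

public void close(TaskAttemptContext context) throws IOException {
    if (getFinalSync(context)) {
        out.hsync(); // push the data to the datanode disks before closing
    }
    out.close();
}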

From source file: com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License: Apache License

@Override
public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException {
    // Ensure that the output directory is set
    Path outDir = getOutputPath(job);
    if (outDir == null) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }

    // get delegation token for outDir's file system
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { outDir }, job.getConfiguration());
}
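
A caller satisfies this check by setting the output path before submission; a minimal sketch (the output path is a placeholder), using the setOutputPath helper inherited from FileOutputFormat:

Job job = Job.getInstance(new Configuration(), "terasort example");
job.setOutputFormatClass(TeraOutputFormat.class);
TeraOutputFormat.setOutputPath(job, new Path("/user/example/terasort-out")); // placeholder path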