Example usage for org.apache.hadoop.mapreduce.JobContext.getConfiguration()

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Source Link

Document

Return the configuration for the job.

Usage

From source file:com.cloudera.ByteBufferInputFormat.java

License:Apache License

/**
 * Reports whether {@code file} may be split. A file is splittable only when
 * no compression codec is resolved for it from the job's configuration.
 *
 * @param context the job context whose configuration drives codec lookup
 * @param file the input file being considered
 * @return {@code true} if no codec matches the file, {@code false} otherwise
 */
@Override
protected boolean isSplitable(JobContext context, Path file) {
    CompressionCodecFactory codecFactory = new CompressionCodecFactory(context.getConfiguration());
    CompressionCodec matchedCodec = codecFactory.getCodec(file);
    return matchedCodec == null;
}

From source file:com.cloudera.crunch.impl.mr.run.CrunchInputFormat.java

License:Apache License

/**
 * Computes the splits for every (input format, node index) pair registered
 * for the job. Each underlying split is wrapped in a {@code CrunchInputSplit}
 * that carries the format class, the node index and the configuration.
 *
 * @param job the job context to compute splits for
 * @return all wrapped splits, in the iteration order of the format/node map
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    // Input paths are set on a separate Job built from the same configuration.
    Job jobCopy = new Job(conf);
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs
            .getFormatNodeMap(jobCopy);

    List<InputSplit> wrappedSplits = Lists.newArrayList();
    for (Map.Entry<Class<? extends InputFormat>, Map<Integer, List<Path>>> formatEntry : formatNodeMap
            .entrySet()) {
        Class<? extends InputFormat> formatClass = formatEntry.getKey();
        Map<Integer, List<Path>> pathsByNode = formatEntry.getValue();
        InputFormat format = (InputFormat) ReflectionUtils.newInstance(formatClass, conf);

        for (Map.Entry<Integer, List<Path>> nodeEntry : pathsByNode.entrySet()) {
            Integer nodeIndex = nodeEntry.getKey();
            List<Path> nodePaths = nodeEntry.getValue();
            Path[] inputPaths = nodePaths.toArray(new Path[nodePaths.size()]);
            FileInputFormat.setInputPaths(jobCopy, inputPaths);

            // Ask the concrete format for its splits over these paths, then tag
            // each one with the format class and node index it belongs to.
            List<InputSplit> rawSplits = format.getSplits(jobCopy);
            for (InputSplit rawSplit : rawSplits) {
                wrappedSplits.add(new CrunchInputSplit(rawSplit, formatClass, nodeIndex, conf));
            }
        }
    }
    return wrappedSplits;
}

From source file:com.cloudera.crunch.impl.mr.run.CrunchInputs.java

License:Apache License

/**
 * Parses the {@code RuntimeParameters.MULTI_INPUTS} configuration value into
 * a nested map of input format class to node index to input paths.
 *
 * <p>The value is split into records on {@code RECORD_SEP}; each record is
 * split again with {@code SPLITTER} into three fields: the input format class
 * name, the node index and a path.
 *
 * @param job the job context whose configuration is read
 * @return the nested format/node/path map
 * @throws RuntimeException wrapping {@link ClassNotFoundException} when a
 *         named input format class cannot be loaded
 */
public static Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> getFormatNodeMap(JobContext job) {
    Configuration conf = job.getConfiguration();
    Map<Class<? extends InputFormat>, Map<Integer, List<Path>>> result = Maps.newHashMap();

    for (String record : Splitter.on(RECORD_SEP).split(conf.get(RuntimeParameters.MULTI_INPUTS))) {
        List<String> fields = ImmutableList.copyOf(SPLITTER.split(record));

        Class<? extends InputFormat> formatClass;
        try {
            formatClass = (Class<? extends InputFormat>) conf.getClassByName(fields.get(0));
        } catch (ClassNotFoundException e) {
            throw new RuntimeException(e);
        }

        // Get-or-create the per-format map; stored values are never null.
        Map<Integer, List<Path>> pathsByNode = result.get(formatClass);
        if (pathsByNode == null) {
            pathsByNode = Maps.<Integer, List<Path>>newHashMap();
            result.put(formatClass, pathsByNode);
        }

        // Get-or-create the path list for this node index.
        Integer nodeIndex = Integer.valueOf(fields.get(1));
        List<Path> nodePaths = pathsByNode.get(nodeIndex);
        if (nodePaths == null) {
            nodePaths = Lists.<Path>newLinkedList();
            pathsByNode.put(nodeIndex, nodePaths);
        }

        nodePaths.add(new Path(fields.get(2)));
    }
    return result;
}

From source file:com.cloudera.recordservice.examples.terasort.RecordServiceTeraInputFormat.java

License:Apache License

/**
 * Returns the splits planned by {@code PlanUtil} for this job, after
 * verifying that the planned schema consists of exactly one STRING column.
 *
 * @param context the job context supplying configuration and credentials
 * @return the planned splits
 * @throws IOException if the schema is not a single STRING column
 */
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    PlanUtil.SplitsInfo planned = PlanUtil.getSplits(context.getConfiguration(), context.getCredentials());
    // The column type is only inspected when there is exactly one column.
    boolean singleStringColumn = planned.schema.getNumColumns() == 1
            && planned.schema.getColumnInfo(0).type.typeId == Schema.Type.STRING;
    if (!singleStringColumn) {
        throw new IOException("Invalid data. Expecting schema to be a single STRING.");
    }
    return planned.splits;
}

From source file:com.cloudera.recordservice.examples.terasort.TeraGen.java

License:Apache License

/**
 * Reads the {@code NUM_ROWS} setting from the job's configuration.
 *
 * @param job the job context to read from
 * @return the configured row count, or 0 when the setting is absent
 */
static long getNumberOfRows(JobContext job) {
    final long configuredRows = job.getConfiguration().getLong(NUM_ROWS, 0);
    return configuredRows;
}

From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample/*from   w ww . j a va2 s. c om*/
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}

From source file:com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License:Apache License

/**
 * Computes the input splits for {@code job}, remembering the result.
 *
 * <p>When called again with the very same context object, the previously
 * computed splits are returned as-is. Otherwise the base splits come from the
 * superclass and, unless {@code TeraScheduler.USE} is set to false in the
 * configuration, are replaced by the splits produced by a
 * {@code TeraScheduler}. Timing for each phase is printed to standard output.
 *
 * @param job the job context to compute splits for
 * @return the (possibly rescheduled) splits
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Identity comparison: only the exact same context object reuses the result.
    if (lastContext == job) {
        return lastResult;
    }

    final long started = System.currentTimeMillis();
    lastContext = job;
    lastResult = super.getSplits(job);
    final long baseSplitsDone = System.currentTimeMillis();
    System.out.println("Spent " + (baseSplitsDone - started) + "ms computing base-splits.");

    if (!job.getConfiguration().getBoolean(TeraScheduler.USE, true)) {
        return lastResult;
    }

    FileSplit[] baseSplits = lastResult.toArray(new FileSplit[0]);
    TeraScheduler scheduler = new TeraScheduler(baseSplits, job.getConfiguration());
    lastResult = scheduler.getNewFileSplits();
    final long schedulerDone = System.currentTimeMillis();
    System.out.println("Spent " + (schedulerDone - baseSplitsDone) + "ms computing TeraScheduler splits.");
    return lastResult;
}

From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License:Apache License

/**
 * Stores in the job's configuration whether a final sync is required before
 * the stream is closed.
 *
 * @param job the job context whose configuration is updated
 * @param newValue {@code true} to require a final sync, {@code false} otherwise
 */
static void setFinalSync(JobContext job, boolean newValue) {
    final boolean requireFinalSync = newValue;
    job.getConfiguration().setBoolean(FINAL_SYNC_ATTRIBUTE, requireFinalSync);
}

From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License:Apache License

/**
 * Tells whether the user asked for a final sync at close.
 *
 * @param job the job context whose configuration is consulted
 * @return the configured flag, or {@code false} when it has not been set
 */
public static boolean getFinalSync(JobContext job) {
    final boolean finalSyncRequested = job.getConfiguration().getBoolean(FINAL_SYNC_ATTRIBUTE, false);
    return finalSyncRequested;
}

From source file:com.cloudera.recordservice.examples.terasort.TeraOutputFormat.java

License:Apache License

/**
 * Validates the job's output settings: an output directory must be set, and
 * delegation tokens are obtained for the file system that hosts it.
 *
 * @param job the job context being validated
 * @throws InvalidJobConfException if no output directory is configured
 * @throws IOException declared by the overridden method; may be raised while
 *         obtaining tokens
 */
@Override
public void checkOutputSpecs(JobContext job) throws InvalidJobConfException, IOException {
    final Path outputDir = getOutputPath(job);
    if (null == outputDir) {
        throw new InvalidJobConfException("Output directory not set in JobConf.");
    }

    // Obtain a delegation token for the output directory's file system.
    final Path[] tokenPaths = { outputDir };
    TokenCache.obtainTokensForNamenodes(job.getCredentials(), tokenPaths, job.getConfiguration());
}