List of usage examples for org.apache.hadoop.mapreduce.Job#getInputFormatClass()
@SuppressWarnings("unchecked") public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException
From source file:mapReduceBasics.WordCount.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser GOP = new GenericOptionsParser(conf, args);
    Configuration newConf = GOP.getConfiguration();
    String[] otherArgs = GOP.getRemainingArgs();
    System.err.println("Best of Luck");
    Job job = new Job(newConf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    Path inputPath = new Path(otherArgs[0]);
    Path outputPath = new Path(otherArgs[1]);
    FileInputFormat.addInputPath(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);
    FileSystem fs = FileSystem.get(conf);
    fs.delete(outputPath, true);
    // job.setNumReduceTasks(3);
    System.err.println("Input Format - " + job.getInputFormatClass());
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:org.apache.blur.mapreduce.lib.BlurMapReduceUtil.java
License:Apache License
/**
 * Add the Blur dependency jars as well as jars for any of the configured job
 * classes to the job configuration, so that JobClient will ship them to the
 * cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass(), DocumentVisibility.class);
        addAllJarsInBlurLib(job.getConfiguration());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:org.apache.jena.tdbloader4.partitioners.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    log.debug("writePartitionFile({},{})", job, sampler);
    Configuration conf = job.getConfiguration();
    @SuppressWarnings("rawtypes")
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks() / 9;
    log.debug("Number of partitions is {} for each index", numPartitions);
    K[] samples = sampler.getSample(inf, job);
    log.info("Using " + samples.length + " samples");
    writePartitionFile(samples, "GSPO", job, conf, numPartitions);
    writePartitionFile(samples, "GPOS", job, conf, numPartitions);
    writePartitionFile(samples, "GOSP", job, conf, numPartitions);
    writePartitionFile(samples, "SPOG", job, conf, numPartitions);
    writePartitionFile(samples, "POSG", job, conf, numPartitions);
    writePartitionFile(samples, "OSPG", job, conf, numPartitions);
    writePartitionFile(samples, "SPO", job, conf, numPartitions);
    writePartitionFile(samples, "POS", job, conf, numPartitions);
    writePartitionFile(samples, "OSP", job, conf, numPartitions);
}
From source file:org.apache.kudu.mapreduce.KuduTableMapReduceUtil.java
License:Apache License
/**
 * Add the Kudu dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    addKuduDependencyJars(job.getConfiguration());
    try {
        addDependencyJars(job.getConfiguration(),
                // when making changes here, consider also mapred.TableMapReduceUtil
                // pull job classes
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.apache.kylin.engine.mr.common.AbstractHadoopJob.java
License:Apache License
public static double getTotalMapInputMB(Job job)
        throws ClassNotFoundException, IOException, InterruptedException, JobException {
    if (job == null) {
        throw new JobException("Job is null");
    }

    long mapInputBytes = 0;
    InputFormat<?, ?> input = ReflectionUtils.newInstance(job.getInputFormatClass(), job.getConfiguration());
    for (InputSplit split : input.getSplits(job)) {
        mapInputBytes += split.getLength();
    }
    if (mapInputBytes == 0) {
        throw new IllegalArgumentException("Map input splits are 0 bytes, something is wrong!");
    }
    double totalMapInputMB = (double) mapInputBytes / 1024 / 1024;
    return totalMapInputMB;
}
From source file:org.broadinstitute.sting.gatk.hadoop.hadoopsrc.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst, job.getMapOutputKeyClass(),
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
From source file:org.cloudgraph.hbase.mapreduce.GraphMapReduceSetup.java
License:Apache License
/**
 * Add the HBase dependency jars as well as jars for any of the configured job
 * classes to the job configuration, so that JobClient will ship them to the
 * cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(), org.apache.zookeeper.ZooKeeper.class,
                com.google.protobuf.Message.class, com.google.common.collect.ImmutableSet.class,
                org.apache.hadoop.hbase.util.Bytes.class, // one class from hbase.jar
                job.getMapOutputKeyClass(), job.getMapOutputValueClass(), job.getInputFormatClass(),
                job.getOutputKeyClass(), job.getOutputValueClass(), job.getOutputFormatClass(),
                job.getPartitionerClass(), job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:org.kiji.mapreduce.TestKijiBulkImportJobBuilder.java
License:Apache License
@Test
public void testBuildWithHFileOutput() throws Exception {
    final MapReduceJob mrjob = KijiBulkImportJobBuilder.create().withConf(getConf())
            .withInput(new TextMapReduceJobInput(new Path(mTempPath, "input")))
            .withBulkImporter(NoopBulkImporter.class)
            .withOutput(new HFileMapReduceJobOutput(mTable, new Path(mTempPath, "output"), 10)).build();

    final Job job = mrjob.getHadoopJob();
    assertEquals(TextInputFormat.class, job.getInputFormatClass());
    assertEquals(BulkImportMapper.class, job.getMapperClass());
    assertEquals(NoopBulkImporter.class,
            job.getConfiguration().getClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, null));
    assertEquals(IdentityReducer.class, job.getReducerClass());
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(KijiHFileOutputFormat.class, job.getOutputFormatClass());
    assertEquals(TotalOrderPartitioner.class, job.getPartitionerClass());
}
From source file:org.kiji.mapreduce.TestKijiBulkImportJobBuilder.java
License:Apache License
@Test
public void testBuildWithKeyValueStore() throws Exception {
    final MapReduceJob mrjob = KijiBulkImportJobBuilder.create().withConf(getConf())
            .withInput(new TextMapReduceJobInput(new Path(mTempPath, "input")))
            .withBulkImporter(KVStoreBulkImporter.class)
            .withOutput(new HFileMapReduceJobOutput(mTable, new Path(mTempPath, "output"), 10)).build();

    final Job job = mrjob.getHadoopJob();
    // Verify that everything else is what we expected as in the previous test
    // (except the bulk importer class name)...
    assertEquals(TextInputFormat.class, job.getInputFormatClass());
    assertEquals(BulkImportMapper.class, job.getMapperClass());
    assertEquals(KVStoreBulkImporter.class,
            job.getConfiguration().getClass(KijiConfKeys.KIJI_BULK_IMPORTER_CLASS, null));
    assertEquals(IdentityReducer.class, job.getReducerClass());
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(KijiHFileOutputFormat.class, job.getOutputFormatClass());
    assertEquals(TotalOrderPartitioner.class, job.getPartitionerClass());

    // KeyValueStore-specific checks here.
    final Configuration confOut = job.getConfiguration();
    assertEquals(1, confOut.getInt(KeyValueStoreConfigSerializer.CONF_KEY_VALUE_STORE_COUNT, 0));
    assertEquals(EmptyKeyValueStore.class.getName(),
            confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
                    + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("foostore",
            confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
                    + KeyValueStoreConfigSerializer.CONF_NAME));
}