List of usage examples for org.apache.hadoop.mapreduce.Job#getInputFormatClass()
@SuppressWarnings("unchecked") public Class<? extends InputFormat<?, ?>> getInputFormatClass() throws ClassNotFoundException
From source file:com.moz.fiji.mapreduce.TestFijiMapReduceJobBuilder.java
License:Apache License
@Test
public void testBuildWithXmlKVStores() throws Exception {
    // Test that we can override default configuration KeyValueStores from an XML file.
    final InputStream xmlStores = Resources.openSystemResource("com.moz.fiji/mapreduce/test-kvstores.xml");

    // This file needs to exist before we build the job, or else
    // we can't build the job; it's referenced by a key-value store that checks
    // for its presence.
    final File tmpFile = new File("/tmp/foo.seq");
    if (tmpFile.createNewFile()) {
        // We created this temp file, we're responsible for deleting it.
        tmpFile.deleteOnExit();
    }

    LOG.info("Building job...");
    final FijiMapReduceJob job = FijiMapReduceJobBuilder.create().withConf(mConf)
        .withInput(MapReduceJobInputs.newTextMapReduceJobInput(new Path("/path/to/my/input")))
        .withMapper(MyMapper.class).withReducer(MyReducer.class)
        .withOutput(MapReduceJobOutputs.newTextMapReduceJobOutput(new Path("/path/to/my/output"), 16))
        .withStoreBindings(xmlStores).build();

    xmlStores.close();

    LOG.info("Verifying job configuration...");
    final Job hadoopJob = job.getHadoopJob();
    assertEquals(TextInputFormat.class, hadoopJob.getInputFormatClass());
    assertEquals(MyMapper.class, hadoopJob.getMapperClass());
    assertEquals(MyReducer.class, hadoopJob.getReducerClass());
    assertEquals(16, hadoopJob.getNumReduceTasks());
    assertEquals(TextOutputFormat.class, hadoopJob.getOutputFormatClass());

    // KeyValueStore-specific checks here.
    // We override mapperMap with a SeqFileKeyValueStore.
    Configuration confOut = hadoopJob.getConfiguration();
    assertEquals(2, confOut.getInt(KeyValueStoreConfigSerializer.CONF_KEY_VALUE_STORE_COUNT, 0));
    assertEquals(SeqFileKeyValueStore.class.getName(),
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
            + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("mapperMap",
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "0."
            + KeyValueStoreConfigSerializer.CONF_NAME));
    assertEquals(EmptyKeyValueStore.class.getName(),
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "1."
            + KeyValueStoreConfigSerializer.CONF_CLASS));
    assertEquals("reducerMap",
        confOut.get(KeyValueStoreConfiguration.KEY_VALUE_STORE_NAMESPACE + "1."
            + KeyValueStoreConfigSerializer.CONF_NAME));
}
From source file:com.moz.fiji.mapreduce.TestFijiProduceJobBuilder.java
License:Apache License
@Test
public void testBuildWithHFileOutput() throws ClassNotFoundException, IOException {
    final FijiMapReduceJob produceJob = FijiProduceJobBuilder.create().withConf(getConf())
        .withInputTable(mTable.getURI()).withProducer(MyProducer.class)
        .withOutput(MapReduceJobOutputs.newHFileMapReduceJobOutput(mTable.getURI(), new Path("foo/bar"), 10))
        .build();

    // Verify that the MR Job was configured correctly.
    final Job job = produceJob.getHadoopJob();
    assertEquals(HBaseFijiTableInputFormat.class, job.getInputFormatClass());
    assertEquals(ProduceMapper.class, job.getMapperClass());
    assertEquals(MyProducer.class, job.getConfiguration().getClass(FijiConfKeys.FIJI_PRODUCER_CLASS, null));
    assertEquals(10, job.getNumReduceTasks());
    assertEquals(FijiHFileOutputFormat.class, job.getOutputFormatClass());
}
From source file:com.savy3.nonequijoin.MapOutputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    HashMap<K, V> samples = (HashMap<K, V>) sampler.getSample(inf, job);
    LOG.info("Using " + samples.size() + " samples");

    // write the input samples in to file <partitionfile>/mapIn
    Path dstOut = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    Path dst = new Path(dstOut, "mapIn");
    FileSystem fs = dst.getFileSystem(conf);
    SequenceFile.Writer sampleWriter = null;
    for (Map.Entry<K, V> sample : samples.entrySet()) {
        sampleWriter = SequenceFile.createWriter(fs, conf, dst,
            sample.getKey().getClass(), sample.getValue().getClass());
        break;
    }
    for (Map.Entry<K, V> sample : samples.entrySet()) {
        sampleWriter.append(sample.getKey(), sample.getValue());
    }
    sampleWriter.close();

    LOG.info("Sample Input File location " + dst.toString());
    // run map reduce on the samples input
    runMap(job, dst);
}
From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java
License:Apache License
/**
 * Add the dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(),
            org.apache.zookeeper.ZooKeeper.class,
            com.google.protobuf.Message.class,
            com.google.common.collect.ImmutableSet.class,
            job.getMapOutputKeyClass(),
            job.getMapOutputValueClass(),
            job.getInputFormatClass(),
            job.getOutputKeyClass(),
            job.getOutputValueClass(),
            job.getOutputFormatClass(),
            job.getPartitionerClass(),
            job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
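A hedged sketch of how a utility like this is typically driven: configure the job's classes first, then call addDependencyJars, since it resolves the jars to ship from job.getInputFormatClass(), job.getOutputFormatClass(), and the other configured classes. The formats and key/value types below are placeholders for the sketch, not Splice Machine's actual choices.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.NullOutputFormat;

import com.splicemachine.mrio.api.SpliceTableMapReduceUtil;

public class DependencyJarsSketch {
    public static Job newJob(Configuration conf) throws Exception {
        Job job = Job.getInstance(conf, "splice-example");
        // Configure the classes first; addDependencyJars() inspects the job via
        // getInputFormatClass(), getOutputFormatClass(), etc., so the jars it
        // ships reflect whatever is set here.
        job.setInputFormatClass(TextInputFormat.class);   // placeholder format for the sketch
        job.setOutputFormatClass(NullOutputFormat.class); // placeholder format for the sketch
        job.setMapperClass(Mapper.class);                  // identity mapper as a stand-in
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        SpliceTableMapReduceUtil.addDependencyJars(job);
        return job;
    }
}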
From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java
License:Apache License
@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);
    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);
    Shard.setIndexShards(conf, shards);

    // empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
From source file:edu.uci.ics.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                    job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:gr.ntua.h2rdf.inputFormat2.TableMapReduceUtil.java
License:Open Source License
/**
 * Add the HBase dependency jars as well as jars for any of the configured
 * job classes to the job configuration, so that JobClient will ship them
 * to the cluster and add them to the DistributedCache.
 */
public static void addDependencyJars(Job job) throws IOException {
    try {
        addDependencyJars(job.getConfiguration(),
            org.apache.zookeeper.ZooKeeper.class,
            com.google.protobuf.Message.class,
            job.getMapOutputKeyClass(),
            job.getMapOutputValueClass(),
            job.getInputFormatClass(),
            job.getOutputKeyClass(),
            job.getOutputValueClass(),
            job.getOutputFormatClass(),
            job.getPartitionerClass(),
            job.getCombinerClass());
    } catch (ClassNotFoundException e) {
        throw new IOException(e);
    }
}
From source file:hu.sztaki.ilab.bigdata.common.tools.InputSampler.java
License:Apache License
/**
 * Write a partition file for the given job, using the Sampler provided.
 * Queries the sampler for a sample keyset, sorts by the output key
 * comparator, selects the keys for each rank, and writes to the destination
 * returned from {@link TotalOrderPartitioner#getPartitionFile}.
 */
@SuppressWarnings("unchecked") // getInputFormat, getOutputKeyComparator
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = job.getConfiguration();
    final InputFormat inf = ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
    int numPartitions = job.getNumReduceTasks();
    K[] samples = sampler.getSample(inf, job);
    LOG.info("Using " + samples.length + " samples");
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    Arrays.sort(samples, comparator);
    Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
    FileSystem fs = dst.getFileSystem(conf);
    if (fs.exists(dst)) {
        fs.delete(dst, false);
    }
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
        job.getMapOutputKeyClass(), NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    float stepSize = samples.length / (float) numPartitions;
    int last = -1;
    for (int i = 1; i < numPartitions; ++i) {
        int k = Math.round(stepSize * i);
        while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
            ++k;
        }
        writer.append(samples[k], nullValue);
        last = k;
    }
    writer.close();
}
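The partition file written by this method is meant to be read back by a TotalOrderPartitioner. A sketch of the driver-side wiring follows, shown with Hadoop's stock InputSampler and TotalOrderPartitioner (which this copy mirrors); the input format, key type, reduce count, path, and sampler parameters are all illustrative.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderDriverSketch {
    public static void configure(Job job) throws Exception {
        // The InputFormat must be set before sampling, because
        // writePartitionFile instantiates it via job.getInputFormatClass().
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(Text.class);
        job.setNumReduceTasks(8);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Point the partitioner at the partition file, then populate it from a
        // random sample of the job's input.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
            new Path("/tmp/partitions.lst"));
        InputSampler.writePartitionFile(job,
            new InputSampler.RandomSampler<Text, Text>(0.1, 10000, 10));
    }
}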
From source file:it.crs4.pydoop.mapreduce.pipes.CommandLineParser.java
License:Apache License
private static void setupPipesJob(Job job) throws IOException, ClassNotFoundException {
    Configuration conf = job.getConfiguration();
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        job.setMapperClass(PipesMapper.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, job.getPartitionerClass());
        job.setPartitionerClass(PipesPartitioner.class);
    }
    if (!getIsJavaReducer(conf)) {
        job.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            job.setOutputFormatClass(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        job.setInputFormatClass(PipesNonJavaInputFormat.class);
    }

    if (avroInput != null) {
        if (explicitInputFormat) {
            conf.setClass(Submitter.INPUT_FORMAT, job.getInputFormatClass(), InputFormat.class);
        } // else let the bridge fall back to the appropriate Avro IF
        switch (avroInput) {
        case K:
            job.setInputFormatClass(PydoopAvroInputKeyBridge.class);
            break;
        case V:
            job.setInputFormatClass(PydoopAvroInputValueBridge.class);
            break;
        case KV:
            job.setInputFormatClass(PydoopAvroInputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro input type");
        }
    }

    if (avroOutput != null) {
        if (explicitOutputFormat) {
            conf.setClass(Submitter.OUTPUT_FORMAT, job.getOutputFormatClass(), OutputFormat.class);
        } // else let the bridge fall back to the appropriate Avro OF
        conf.set(props.getProperty("AVRO_OUTPUT"), avroOutput.name());
        switch (avroOutput) {
        case K:
            job.setOutputFormatClass(PydoopAvroOutputKeyBridge.class);
            break;
        case V:
            job.setOutputFormatClass(PydoopAvroOutputValueBridge.class);
            break;
        case KV:
            job.setOutputFormatClass(PydoopAvroOutputKeyValueBridge.class);
            break;
        default:
            throw new IllegalArgumentException("Bad Avro output type");
        }
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        String msg = "No application program defined.";
        throw new IllegalArgumentException(msg);
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    // FIXME: this is kind of useless if the pipes program is not in c++
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        String msg = "Problem parsing executable URI " + exec;
        IOException ie = new IOException(msg);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}
From source file:it.crs4.seal.recab.RecabTable.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");

    RecabTableOptionParser parser = new RecabTableOptionParser();
    parser.parse(getConf(), args);
    LOG.info("Using " + parser.getNReduceTasks() + " reduce tasks");

    // must be called before creating the job, since the job
    // *copies* the Configuration.
    distributeVariantsFile(parser);

    // Create a Job using the processed conf
    Job job = new Job(getConf(), "RecabTable " + parser.getInputPaths().get(0));
    job.setJarByClass(RecabTable.class);
    job.setInputFormatClass(FormatNameMap.getInputFormat(
        job.getConfiguration().get(RecabTableOptionParser.INPUT_FORMAT_CONF, "sam")));
    LOG.info("Using input format " + job.getInputFormatClass().getName());

    // input paths
    for (Path p : parser.getInputPaths())
        FileInputFormat.addInputPath(job, p);

    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(ObservationCount.class);

    job.setCombinerClass(Combiner.class);

    job.setReducerClass(Red.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    // output
    FileOutputFormat.setOutputPath(job, parser.getOutputPath());

    // Submit the job, then poll for progress until the job is complete
    boolean result = job.waitForCompletion(true);
    if (result) {
        LOG.info("done");
        return 0;
    } else {
        LOG.fatal(this.getClass().getName() + " failed!");
        return 1;
    }
}