Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setInputFormatClass, drawn from open source projects.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
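
Before the project examples, here is a minimal, self-contained driver sketch showing where setInputFormatClass fits in a typical job setup. The driver, mapper, and reducer classes (ExampleDriver, ExampleMapper, ExampleReducer) and the argument paths are placeholders for illustration only; they do not come from the examples below.

// Minimal driver sketch. ExampleDriver, ExampleMapper and ExampleReducer are
// hypothetical classes. Required imports: org.apache.hadoop.conf.Configuration,
// org.apache.hadoop.fs.Path, org.apache.hadoop.io.IntWritable,
// org.apache.hadoop.io.Text, org.apache.hadoop.mapreduce.Job,
// org.apache.hadoop.mapreduce.lib.input.FileInputFormat,
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat,
// org.apache.hadoop.mapreduce.lib.output.FileOutputFormat,
// org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "input-format-example");
    job.setJarByClass(ExampleDriver.class); // hypothetical driver class

    // Must be called before the job is submitted; it throws
    // IllegalStateException once the job is running.
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(ExampleMapper.class);
    job.setReducerClass(ExampleReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}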

Usage

From source file:com.soteradefense.dga.LouvainRunner.java

License:Apache License

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table  The Splice table name to read from.
 * @param scan  The scan instance with the columns, time range etc.
 * @param mapper  The mapper class to use.
 * @param outputKeyClass  The class of the output key.
 * @param outputValueClass  The class of the output value.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When setting up the details fails.
 */
public static void initTableMapperJob(String table, Scan scan, Class<? extends Mapper> mapper,
        Class<? extends WritableComparable> outputKeyClass, Class<? extends Object> outputValueClass, Job job,
        boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass) throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null)
        job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null)
        job.setMapOutputKeyClass(outputKeyClass);
    if (mapper != null)
        job.setMapperClass(mapper);
    job.getConfiguration().set(MRConstants.SPLICE_INPUT_TABLE_NAME, table);
    job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
    if (addDependencyJars) {
        addDependencyJars(job);
    }

}
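
For context, a call to the helper above might look like the following sketch. The table name, Scan setup, mapper class (MyRowMapper), and the choice of TableInputFormat are illustrative assumptions, not taken from the Splice Machine sources.

// Hypothetical invocation of initTableMapperJob; MyRowMapper and the table
// name are placeholders, and TableInputFormat is assumed to be the
// Splice/HBase input format available on the classpath.
Configuration conf = HBaseConfiguration.create();
Job job = Job.getInstance(conf, "splice-table-scan");
Scan scan = new Scan(); // full-table scan; restrict columns/time range as needed

SpliceTableMapReduceUtil.initTableMapperJob(
        "MY_TABLE",                    // Splice table to read from
        scan,
        MyRowMapper.class,             // placeholder Mapper implementation
        ImmutableBytesWritable.class,  // map output key class
        Result.class,                  // map output value class
        job,
        true,                          // add dependency jars to the distributed cache
        TableInputFormat.class);       // InputFormat passed to setInputFormatClass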

From source file:com.splout.db.benchmark.IdentityJob.java

License:Apache License

@Override
public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
        jComm.parse(params);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if (pangoolSchema == null) {
        // Use plain Hadoop API
        Job job = new Job(conf);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outP);

        job.waitForCompletion(true);

    } else {
        if (groupBy == null) {
            System.err.println("If pangoolSchema is used, groupBy must also be used.");
            jComm.usage();
            System.exit(-1);
        }

        Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
        Path inputP = new Path(inputPath);

        // Use Pangool API - parse CSV, etc
        TupleMRBuilder builder = new TupleMRBuilder(conf);
        TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
                separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
        TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
                quotes.charAt(0), escape.charAt(0));

        builder.addIntermediateSchema(schema);
        builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
        builder.setGroupByFields(groupBy);
        builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
        builder.setTupleReducer(new IdentityTupleReducer());
        builder.setJarByClass(this.getClass());

        builder.createJob().waitForCompletion(true);
    }

    return 1;
}

From source file:com.splout.db.hadoop.TupleSampler.java

License:Apache License

public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
        throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recovering results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();

    try {
        for (Table table : tablespace.getPartitionedTables()) {

            // Initialize JavaScript engine if needed
            JavascriptEngine jsEngine = null;
            TableSpec tableSpec = table.getTableSpec();
            if (tableSpec.getPartitionByJavaScript() != null) {
                try {
                    jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
            }

            for (TableInput tableFile : table.getFiles()) {
                @SuppressWarnings("deprecation")
                Job job = new Job(hadoopConf);
                FileInputFormat.setInputPaths(job, tableFile.getPaths());
                if (options.getMaxInputSplitSize() != null) {
                    logger.info("Using max input split size: " + options.getMaxInputSplitSize());
                    FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
                }
                job.setInputFormatClass(FileInputFormat.class);

                if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                    for (Map.Entry<String, String> specificHadoopConf : tableFile
                            .getSpecificHadoopInputFormatContext().entrySet()) {
                        job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
                    }
                }

                for (InputSplit split : tableFile.getFormat().getSplits(job)) {
                    if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                        specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
                    }
                    splitToFormat.put(split, tableFile.getFormat());
                    recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
                    splitToTableSpec.put(split, tableSpec);
                    splitToJsEngine.put(split, jsEngine);
                    splits.add(split);
                }
            }
        }

        long retrievedSamples;
        if (samplingType.equals(SamplingType.RANDOM)) {
            try {
                RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
                // Default sampling method
                retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
                        splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
                        defOptions.getMaxSplitsToVisit());
            } catch (ClassCastException ef) {
                throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
                        + RandomSamplingOptions.class);
            }
        } else {
            // Reservoir sampling over full data
            retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
        }
        return retrievedSamples;
    } catch (IOException e) {
        throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
        throw new TupleSamplerException(e);
    }
}

From source file:com.splunk.shuttl.integration.hadoop.hbase.CSVJobFactory.java

License:Apache License

/**
 * @return the configured Job
 * @throws IOException
 */
public static Job getConfiguredJob(String[] arguments) throws IOException {

    Configuration jobConfiguration = new Configuration(true);
    // Load hbase-site.xml
    HBaseConfiguration.addHbaseResources(jobConfiguration);

    jobConfiguration.set("fs.default.name", arguments[0]);
    jobConfiguration.set("mapred.job.tracker", arguments[1]);
    jobConfiguration.set(JobConfigurationConstants.FILENAME, arguments[2]);
    jobConfiguration.set(JobConfigurationConstants.OUTPUT_PATH, arguments[3]);
    jobConfiguration.set(JobConfigurationConstants.TABLE_NAME, arguments[4]);

    jobConfiguration.set(JobConfigurationConstants.COLUMN_FAMILY, "d");

    Job job = new Job(jobConfiguration, "BucketToHbase");
    job.setJarByClass(CSVMapper.class);

    job.setMapperClass(CSVMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);

    job.setInputFormatClass(TextInputFormat.class);

    return job;
}

From source file:com.springsource.insight.plugin.hadoop.WordCount.java

License:Open Source License

public int run(String[] args) throws Exception {
    String INPUT = "src/test/resources";
    String OUTPUT = "target/out";

    Configuration conf = new Configuration();
    File targetFolder = FileUtil.detectTargetFolder(getClass());
    if (targetFolder == null) {
        throw new IllegalStateException("Cannot detect target folder");
    }
    File tempFolder = new File(targetFolder, "temp");
    conf.set("hadoop.tmp.dir", tempFolder.getAbsolutePath());

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileUtils.deleteDirectory(new File(OUTPUT)); // delete old output data
    FileInputFormat.addInputPath(job, new Path(INPUT));
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's map only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}

From source file:com.stride.cartrek.core.hbase.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in
    // TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    /*
     * // Defines additional single text based output 'GoogleAdClient' for
     * the job MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     * TextOutputFormat.class, Text.class,LongWritable.class );
     * 
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job,
     * "GoogleAdType",TextOutputFormat.class, Text.class,
     * LongWritable.class);
     */
    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates zero-sized default output e.g.: *
     * part-r-00000. To prevent this use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in Hadoop job
     * configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);
    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {

        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}

From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}