Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce.Job.setInputFormatClass, drawn from open source projects.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
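
Before the project examples, here is a minimal, self-contained driver sketch showing where setInputFormatClass fits in a typical job setup. The driver, mapper, and reducer classes (ExampleDriver, ExampleMapper, ExampleReducer) and the argument paths are placeholders for illustration only; they do not come from the examples below.

// Minimal driver sketch. ExampleDriver, ExampleMapper and ExampleReducer are
// hypothetical classes. Required imports: org.apache.hadoop.conf.Configuration,
// org.apache.hadoop.fs.Path, org.apache.hadoop.io.IntWritable,
// org.apache.hadoop.io.Text, org.apache.hadoop.mapreduce.Job,
// org.apache.hadoop.mapreduce.lib.input.FileInputFormat,
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat,
// org.apache.hadoop.mapreduce.lib.output.FileOutputFormat,
// org.apache.hadoop.mapreduce.lib.output.TextOutputFormat.
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "input-format-example");
    job.setJarByClass(ExampleDriver.class); // hypothetical driver class

    // Must be called before the job is submitted; it throws
    // IllegalStateException once the job is running.
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(ExampleMapper.class);
    job.setReducerClass(ExampleReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}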

Usage

From source file:com.soteradefense.dga.LouvainRunner.java

License:Apache License

private int runMapreduceJob(String inputPath, String outputPath, DGAConfiguration conf) throws Exception {
    Configuration mrConf = new Configuration();
    for (Map.Entry<String, String> entry : conf.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(LouvainRunner.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.splicemachine.mrio.api.SpliceTableMapReduceUtil.java

License:Apache License

/**
 * Use this before submitting a TableMap job. It will appropriately set up
 * the job.
 *
 * @param table  The Splice table name to read from.
 * @param scan  The scan instance with the columns, time range etc.
 * @param mapper  The mapper class to use.
 * @param outputKeyClass  The class of the output key.
 * @param outputValueClass  The class of the output value.
 * @param job  The current job to adjust.  Make sure the passed job is
 * carrying all necessary HBase configuration.
 * @param addDependencyJars upload HBase jars and jars for any of the configured
 *           job classes via the distributed cache (tmpjars).
 * @throws IOException When setting up the details fails.
 */
public static void initTableMapperJob(String table, Scan scan, Class<? extends Mapper> mapper,
        Class<? extends WritableComparable> outputKeyClass, Class<? extends Object> outputValueClass, Job job,
        boolean addDependencyJars, Class<? extends InputFormat> inputFormatClass) throws IOException {
    job.setInputFormatClass(inputFormatClass);
    if (outputValueClass != null)
        job.setMapOutputValueClass(outputValueClass);
    if (outputKeyClass != null)
        job.setMapOutputKeyClass(outputKeyClass);
    if (mapper != null)
        job.setMapperClass(mapper);
    job.getConfiguration().set(MRConstants.SPLICE_INPUT_TABLE_NAME, table);
    job.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
    if (addDependencyJars) {
        addDependencyJars(job);
    }

}
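
For context, a call to the helper above might look like the following sketch. The table name, Scan setup, mapper class (MyRowMapper), and the choice of TableInputFormat are illustrative assumptions, not taken from the Splice Machine sources.

// Hypothetical invocation of initTableMapperJob; MyRowMapper and the table
// name are placeholders, and TableInputFormat is assumed to be the
// Splice/HBase input format available on the classpath.
Configuration conf = HBaseConfiguration.create();
Job job = Job.getInstance(conf, "splice-table-scan");
Scan scan = new Scan(); // full-table scan; restrict columns/time range as needed

SpliceTableMapReduceUtil.initTableMapperJob(
        "MY_TABLE",                    // Splice table to read from
        scan,
        MyRowMapper.class,             // placeholder Mapper implementation
        ImmutableBytesWritable.class,  // map output key class
        Result.class,                  // map output value class
        job,
        true,                          // add dependency jars to the distributed cache
        TableInputFormat.class);       // InputFormat passed to setInputFormatClass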

From source file:com.splout.db.benchmark.IdentityJob.java

License:Apache License

@Override
public int run(String[] params) throws Exception {
    // Validate params etc
    JCommander jComm = new JCommander(this);
    jComm.setProgramName("Identity Job");
    try {
        jComm.parse(params);
    } catch (ParameterException e) {
        System.err.println(e.getMessage());
        jComm.usage();
        System.exit(-1);
    }

    Path outP = new Path(outputPath);
    HadoopUtils.deleteIfExists(FileSystem.get(conf), outP);

    if (pangoolSchema == null) {
        // Use plain Hadoop API
        Job job = new Job(conf);
        job.setInputFormatClass(TextInputFormat.class);
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outP);

        job.waitForCompletion(true);

    } else {
        if (groupBy == null) {
            System.err.println("If pangoolSchema is used, groupBy must also be used.");
            jComm.usage();
            System.exit(-1);
        }

        Schema schema = new Schema("sch", Fields.parse(pangoolSchema));
        Path inputP = new Path(inputPath);

        // Use Pangool API - parse CSV, etc
        TupleMRBuilder builder = new TupleMRBuilder(conf);
        TupleTextInputFormat parsingInputFormat = new TupleTextInputFormat(schema, skipHeading, false,
                separator.charAt(0), quotes.charAt(0), escape.charAt(0), FieldSelector.NONE, null);
        TupleTextOutputFormat outputFormat = new TupleTextOutputFormat(schema, false, separator.charAt(0),
                quotes.charAt(0), escape.charAt(0));

        builder.addIntermediateSchema(schema);
        builder.addInput(inputP, parsingInputFormat, new IdentityTupleMapper());
        builder.setGroupByFields(groupBy);
        builder.setOutput(outP, outputFormat, ITuple.class, NullWritable.class);
        builder.setTupleReducer(new IdentityTupleReducer());
        builder.setJarByClass(this.getClass());

        builder.createJob().waitForCompletion(true);
    }

    return 1;
}

From source file:com.splout.db.hadoop.TupleSampler.java

License:Apache License

public long sample(TablespaceSpec tablespace, Configuration hadoopConf, long sampleSize, Path outFile)
        throws TupleSamplerException {
    // 1 - Determine Input Splits
    // 2 - Launch sampling with the selected method
    // 3 - Recovering results
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Map<InputSplit, InputFormat<ITuple, NullWritable>> splitToFormat = new HashMap<InputSplit, InputFormat<ITuple, NullWritable>>();
    Map<InputSplit, RecordProcessor> recordProcessorPerSplit = new HashMap<InputSplit, RecordProcessor>();
    Map<InputSplit, Map<String, String>> specificHadoopConfMap = new HashMap<InputSplit, Map<String, String>>();
    Map<InputSplit, TableSpec> splitToTableSpec = new HashMap<InputSplit, TableSpec>();
    Map<InputSplit, JavascriptEngine> splitToJsEngine = new HashMap<InputSplit, JavascriptEngine>();

    try {
        for (Table table : tablespace.getPartitionedTables()) {

            // Initialize JavaScript engine if needed
            JavascriptEngine jsEngine = null;
            TableSpec tableSpec = table.getTableSpec();
            if (tableSpec.getPartitionByJavaScript() != null) {
                try {
                    jsEngine = new JavascriptEngine(tableSpec.getPartitionByJavaScript());
                } catch (Throwable e) {
                    throw new RuntimeException(e);
                }
            }

            for (TableInput tableFile : table.getFiles()) {
                @SuppressWarnings("deprecation")
                Job job = new Job(hadoopConf);
                FileInputFormat.setInputPaths(job, tableFile.getPaths());
                if (options.getMaxInputSplitSize() != null) {
                    logger.info("Using max input split size: " + options.getMaxInputSplitSize());
                    FileInputFormat.setMaxInputSplitSize(job, options.getMaxInputSplitSize());
                }
                job.setInputFormatClass(FileInputFormat.class);

                if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                    for (Map.Entry<String, String> specificHadoopConf : tableFile
                            .getSpecificHadoopInputFormatContext().entrySet()) {
                        job.getConfiguration().set(specificHadoopConf.getKey(), specificHadoopConf.getValue());
                    }
                }

                for (InputSplit split : tableFile.getFormat().getSplits(job)) {
                    if (tableFile.getSpecificHadoopInputFormatContext() != null) {
                        specificHadoopConfMap.put(split, tableFile.getSpecificHadoopInputFormatContext());
                    }
                    splitToFormat.put(split, tableFile.getFormat());
                    recordProcessorPerSplit.put(split, tableFile.getRecordProcessor());
                    splitToTableSpec.put(split, tableSpec);
                    splitToJsEngine.put(split, jsEngine);
                    splits.add(split);
                }
            }
        }

        long retrievedSamples;
        if (samplingType.equals(SamplingType.RANDOM)) {
            try {
                RandomSamplingOptions defOptions = (RandomSamplingOptions) options;
                // Default sampling method
                retrievedSamples = randomSampling(sampleSize, hadoopConf, outFile, splits, splitToTableSpec,
                        splitToFormat, specificHadoopConfMap, recordProcessorPerSplit, splitToJsEngine,
                        defOptions.getMaxSplitsToVisit());
            } catch (ClassCastException ef) {
                throw new RuntimeException("Invalid options class: " + options.getClass() + " Expected:"
                        + RandomSamplingOptions.class);
            }
        } else {
            // Reservoir sampling over full data
            retrievedSamples = fullScanSampling(tablespace, sampleSize, hadoopConf, outFile, splits.size());
        }
        return retrievedSamples;
    } catch (IOException e) {
        throw new TupleSamplerException(e);
    } catch (InterruptedException e) {
        throw new TupleSamplerException(e);
    }
}

From source file:com.splunk.shuttl.integration.hadoop.hbase.CSVJobFactory.java

License:Apache License

/**
 * @return the configured Job
 * @throws IOException
 */
public static Job getConfiguredJob(String[] arguments) throws IOException {

    Configuration jobConfiguration = new Configuration(true);
    // Load hbase-site.xml
    HBaseConfiguration.addHbaseResources(jobConfiguration);

    jobConfiguration.set("fs.default.name", arguments[0]);
    jobConfiguration.set("mapred.job.tracker", arguments[1]);
    jobConfiguration.set(JobConfigurationConstants.FILENAME, arguments[2]);
    jobConfiguration.set(JobConfigurationConstants.OUTPUT_PATH, arguments[3]);
    jobConfiguration.set(JobConfigurationConstants.TABLE_NAME, arguments[4]);

    jobConfiguration.set(JobConfigurationConstants.COLUMN_FAMILY, "d");

    Job job = new Job(jobConfiguration, "BucketToHbase");
    job.setJarByClass(CSVMapper.class);

    job.setMapperClass(CSVMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);

    job.setInputFormatClass(TextInputFormat.class);

    return job;
}

From source file:com.springsource.insight.plugin.hadoop.WordCount.java

License:Open Source License

public int run(String[] args) throws Exception {
    String INPUT = "src/test/resources";
    String OUTPUT = "target/out";

    Configuration conf = new Configuration();
    File targetFolder = FileUtil.detectTargetFolder(getClass());
    if (targetFolder == null) {
        throw new IllegalStateException("Cannot detect target folder");
    }
    File tempFolder = new File(targetFolder, "temp");
    conf.set("hadoop.tmp.dir", tempFolder.getAbsolutePath());

    Job job = new Job(conf, "wordcount");
    job.setJarByClass(WordCount.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountReducer.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    FileUtils.deleteDirectory(new File(OUTPUT)); // delete old output data
    FileInputFormat.addInputPath(job, new Path(INPUT));
    FileOutputFormat.setOutputPath(job, new Path(OUTPUT));

    return job.waitForCompletion(true) ? 0 : -1;
}

From source file:com.streamsets.pipeline.stage.destination.mapreduce.jobtype.avroconvert.AvroConversionBaseCreator.java

License:Apache License

@Override
public Job call() throws Exception {
    // We're explicitly disabling speculative execution
    conf.set("mapreduce.map.speculative", "false");
    conf.set("mapreduce.map.maxattempts", "1");

    conf.set("mapreduce.job.user.classpath.first", "true");
    conf.set("mapreduce.task.classpath.user.precedence", "true");
    conf.set("mapreduce.task.classpath.first", "true");

    addNecessaryJarsToJob(conf);

    Job job = Job.getInstance(conf);

    // IO formats
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(NullOutputFormat.class);

    // Mapper & job output
    job.setMapperClass(getMapperClass());
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);

    // It's map only job
    job.setNumReduceTasks(0);

    // General configuration
    job.setJarByClass(getClass());

    return job;
}

From source file:com.stride.cartrek.core.hbase.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in
    // TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}

From source file:com.synerzip.analytics.commoncrawl.googleads.counter.GoogleAdsCounterJob.java

License:Apache License

/**
 * Configures and submits the Map Reduce Job to Hadoop
 */
public int run(String[] args) throws Exception {

    String inputPath = null;
    String outputPath = null;
    boolean overwrite = false;
    String s3AccessKey = null;
    String s3SecretKey = null;

    // Read the command line arguments. We're not using GenericOptionsParser
    // to prevent having to include commons.cli as a dependency.
    for (int index = 0; index < args.length; index++) {
        try {

            if (ARGNAME_INPATH.equals(args[index])) {
                inputPath = args[++index];
            } else if (ARGNAME_OUTPATH.equals(args[index])) {
                outputPath = args[++index];
            } else if (ARGNAME_S3ACCESSKEY.equals(args[index])) {
                s3AccessKey = args[++index];
            } else if (ARGNAME_S3SECRETKEY.equals(args[index])) {
                s3SecretKey = args[++index];
            } else if (ARGNAME_MAXFILES.equals(args[index])) {
                // FIXME - No use of static methods
                WarcFileFilter.setMax(Long.parseLong(args[++index]));
            } else if (ARGNAME_OVERWRITE.equals(args[index])) {
                overwrite = true;
            } else {
                LOG.warn("Unsupported argument: " + args[index]);
            }
        } catch (ArrayIndexOutOfBoundsException e) {
            usage();
            throw new IllegalArgumentException();
        }
    }

    if (inputPath == null || outputPath == null) {
        usage();
        throw new IllegalArgumentException();
    }

    if (inputPath.contains("s3n") && (s3AccessKey == null || s3SecretKey == null)) {
        usage();
        LOG.info("Please specify Access Key and Secret Key to access data on AWS S3 storage ");
        throw new IllegalArgumentException();
    }

    // Create the Hadoop job.
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);
    job.setJarByClass(GoogleAdsCounterJob.class);
    if (inputPath.contains("s3n") && (s3AccessKey != null && s3SecretKey != null)) {
        conf.set("AWS_ACCESS_KEY_ID", s3AccessKey);
        conf.set("AWS_SECRET_ACCESS_KEY", s3SecretKey);
    }
    // Scan the provided input path for WARC files.
    LOG.info("setting input path to '" + inputPath + "'");

    WarcFileFilter.setFilter(FILEFILTER);
    FileInputFormat.addInputPath(job, new Path(inputPath));

    // FIXME - I see the problem that you want to give a dynamic number to a
    // static class. My question is, Is this really required, if we just
    // point to a file in s3 that should solve our problem
    FileInputFormat.setInputPathFilter(job, WarcFileFilter.class);

    // Delete the output path directory if it already exists and user wants
    // to overwrite it.
    if (overwrite) {
        LOG.info("clearing the output path at '" + outputPath + "'");
        FileSystem fs = FileSystem.get(new URI(outputPath), conf);
        if (fs.exists(new Path(outputPath))) {
            fs.delete(new Path(outputPath), true);
        }
    }

    // Set the path where final output 'part' files will be saved.
    LOG.info("setting output path to '" + outputPath + "'");
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    /*
     * // Defines additional single text based output 'GoogleAdClient' for
     * the job MultipleOutputs.addNamedOutput(job, "GoogleAdClient",
     * TextOutputFormat.class, Text.class,LongWritable.class );
     * 
     * // Defines additional text based output 'GoogleAdType' for the job
     * MultipleOutputs.addNamedOutput(job,
     * "GoogleAdType",TextOutputFormat.class, Text.class,
     * LongWritable.class);
     */
    // Set which InputFormat class to use.
    job.setInputFormatClass(WARCInputFormat.class);

    // Set which OutputFormat class to use.
    job.setOutputFormatClass(TextOutputFormat.class);

    /*
     * Using MultipleOutputs creates zero-sized default output e.g.: *
     * part-r-00000. To prevent this use LazyOutputFormat instead of
     * job.setOutputFormatClass(TextOutputFormat.class) in Hadoop job
     * configuration.
     */
    // LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);

    //   job.setPartitionerClass(GoogleAdsCounterPartitioner.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    //job.setNumReduceTasks(4);
    // Set the output data types.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Set which Mapper and Reducer classes to use.
    job.setMapperClass(GoogleAdsCounterMapper.class);
    // job.setMapperClass(CrawlMapper_AdStatsDetails.class);
    job.setReducerClass(GoogleAdsCounterReducer.class);

    // set combiner
    //job.setCombinerClass(GoogleAdsCounterReducer.class);

    // set job name
    job.setJobName("CommonCrawl Data Processing : Counting Google Ads");

    long startTime = System.currentTimeMillis();
    if (job.waitForCompletion(true)) {

        LOG.info("Job completion status : " + job.waitForCompletion(true));
        long endTime = System.currentTimeMillis();

        long difference = endTime - startTime;
        LOG.info("Elapsed milliseconds: " + difference);
        Counter totalResponsePagesCounter = job.getCounters().findCounter(TestCounters.TOTALRESPONSEPAGES);
        LOG.info("totalResponsePagesCounter = " + totalResponsePagesCounter.getValue());

        Counter totalGoogleAdPagesCounter = job.getCounters().findCounter(TestCounters.TOTALGOOGLEADSPAGES);
        LOG.info("totalGoogleAdPagesCounter = " + totalGoogleAdPagesCounter.getValue());

        return 0;
    } else {
        return 1;
    }
}

From source file:com.talis.hadoop.rdf.collation.QuadsCollater.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Configuration configuration = getConf();

    boolean useCompression = configuration.getBoolean(Constants.OPTION_USE_COMPRESSION,
            Constants.OPTION_USE_COMPRESSION_DEFAULT);
    if (useCompression) {
        configuration.setBoolean("mapred.compress.map.output", true);
        configuration.set("mapred.output.compression.type", "BLOCK");
        configuration.set("mapred.map.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec");
    }

    boolean overrideOutput = configuration.getBoolean(Constants.OPTION_OVERRIDE_OUTPUT,
            Constants.OPTION_OVERRIDE_OUTPUT_DEFAULT);
    FileSystem fs = FileSystem.get(new Path(args[1]).toUri(), configuration);
    if (overrideOutput) {
        fs.delete(new Path(args[1]), true);
    }

    Job job = new Job(configuration);
    job.setJobName(JOB_NAME);
    job.setJarByClass(getClass());

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    FileOutputFormat.setCompressOutput(job, true);

    job.setInputFormatClass(NQuadsInputFormat.class);
    job.setMapperClass(CollationMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(QuadWritable.class);

    job.setReducerClass(CollationReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(QuadArrayWritable.class);

    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    if (LOG.isDebugEnabled())
        Utils.log(job, LOG);

    return job.waitForCompletion(true) ? 0 : 1;
}