Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.Job#setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job. Throws IllegalStateException if the job has already been submitted.
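
Before the project-specific examples below, a minimal sketch of the call in context. ExampleDriver and ExampleMapper are hypothetical placeholders, not classes from the examples on this page; TextInputFormat here is org.apache.hadoop.mapreduce.lib.input.TextInputFormat. The input format must be set before the job is submitted, otherwise the call throws IllegalStateException.

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "setInputFormatClass example");
    job.setJarByClass(ExampleDriver.class);
    job.setMapperClass(ExampleMapper.class);
    // Tell the framework how input files are split and parsed
    // into key/value records; must precede job submission.
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}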

Usage

From source file:com.sematext.hbase.wd.RowKeyDistributorTestBase.java

License:Apache License

private void testMapReduceInternal(long origKeyPrefix, Scan scan, int numValues, int startWithValue,
        int seekIntervalMinValue, int seekIntervalMaxValue)
        throws IOException, InterruptedException, ClassNotFoundException {
    int valuesCountInSeekInterval = writeTestData(origKeyPrefix, numValues, startWithValue,
            seekIntervalMinValue, seekIntervalMaxValue);

    // Reading data
    Configuration conf = testingUtility.getConfiguration();
    Job job = new Job(conf, "testMapReduceInternal()-Job");
    job.setJarByClass(this.getClass());
    TableMapReduceUtil.initTableMapperJob(TABLE_NAME, scan, RowCounterMapper.class,
            ImmutableBytesWritable.class, Result.class, job);

    // Substituting standard TableInputFormat which was set in TableMapReduceUtil.initTableMapperJob(...)
    job.setInputFormatClass(WdTableInputFormat.class);
    keyDistributor.addInfo(job.getConfiguration());

    job.setOutputFormatClass(NullOutputFormat.class);
    job.setNumReduceTasks(0);

    boolean succeeded = job.waitForCompletion(true);
    Assert.assertTrue(succeeded);

    long mapInputRecords = job.getCounters().findCounter(RowCounterMapper.Counters.ROWS).getValue();
    Assert.assertEquals(valuesCountInSeekInterval, mapInputRecords);
}

From source file:com.sequenceiq.yarntest.mr.QuasiMonteCarlo.java

License:Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static JobID submitPiEstimationMRApp(String jobName, int numMaps, long numPoints, Path tmpDir,
        Configuration conf) throws IOException, ClassNotFoundException, InterruptedException {
    Job job = new Job(conf);
    //setup job conf
    job.setJobName(jobName);
    job.setJarByClass(QuasiMonteCarlo.class);

    job.setInputFormatClass(SequenceFileInputFormat.class);

    job.setOutputKeyClass(BooleanWritable.class);
    job.setOutputValueClass(LongWritable.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapperClass(QmcMapper.class);

    job.setReducerClass(QmcReducer.class);
    job.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    job.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(tmpDir, "in");
    final Path outDir = new Path(tmpDir, "out");
    FileInputFormat.setInputPaths(job, inDir);
    FileOutputFormat.setOutputPath(job, outDir);

    final FileSystem fs = FileSystem.get(conf);
    if (fs.exists(tmpDir)) {
        fs.delete(tmpDir, true);
        //      throw new IOException("Tmp directory " + fs.makeQualified(tmpDir)
        //          + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    //  try {
    //generate an input file for each map task
    for (int i = 0; i < numMaps; ++i) {
        final Path file = new Path(inDir, "part" + i);
        final LongWritable offset = new LongWritable(i * numPoints);
        final LongWritable size = new LongWritable(numPoints);
        final SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, LongWritable.class,
                LongWritable.class, CompressionType.NONE);
        try {
            writer.append(offset, size);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i);
    }

    //start a map/reduce job
    System.out.println("Starting Job");
    final long startTime = System.currentTimeMillis();
    job.submit();
    //      final double duration = (System.currentTimeMillis() - startTime)/1000.0;
    //      System.out.println("Job Finished in " + duration + " seconds");
    return job.getJobID();

    //    } finally {
    //      fs.delete(tmpDir, true);
    //    }
}

From source file:com.shmsoft.dmass.main.MRFreeEedProcess.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    // inventory dir holds all package (zip) files resulting from stage
    String projectFileName = args[0];
    String outputPath = args[1];
    logger.info("Running Hadoop job");
    logger.info("Input project file = " + projectFileName);
    logger.info("Output path = " + outputPath);

    // Hadoop configuration class
    Configuration configuration = getConf();
    // No speculative execution! Do not process the same file twice
    configuration.set("mapred.reduce.tasks.speculative.execution", "false");
    // TODO even in local mode, the first argument should not be the inventory
    // but write a complete project file instead
    Project project = Project.getProject();
    if (project == null || project.isEmpty()) {
        // configure Hadoop input files
        System.out.println("Reading project file " + projectFileName);
        project = new Project().loadFromFile(new File(projectFileName));
        Project.setProject(project);
    }
    project.setProperty(ParameterProcessing.OUTPUT_DIR_HADOOP, outputPath);
    // send complete project information to all mappers and reducers
    configuration.set(ParameterProcessing.PROJECT, project.toString());

    Settings.load();
    configuration.set(ParameterProcessing.SETTINGS_STR, Settings.getSettings().toString());
    configuration.set(ParameterProcessing.METADATA_FILE,
            Files.toString(new File(ColumnMetadata.metadataNamesFile), Charset.defaultCharset()));
    Job job = new Job(configuration);
    job.setJarByClass(MRFreeEedProcess.class);
    job.setJobName("MRFreeEedProcess");

    // Hadoop processes key-value pairs
    job.setOutputKeyClass(MD5Hash.class);
    job.setOutputValueClass(MapWritable.class);

    // set map and reduce classes
    job.setMapperClass(Map.class);
    job.setReducerClass(Reduce.class);

    // Hadoop TextInputFormat class
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    //        String delim = "\u0001";
    //        configuration.set("mapred.textoutputformat.separator", delim);
    //        configuration.set("mapreduce.output.textoutputformat.separator", delim);

    logger.debug("project.isEnvHadoop() = {} ", project.isEnvHadoop());
    String inputPath = projectFileName;
    if (project.isEnvHadoop() || Settings.getSettings().isHadoopDebug()) {
        inputPath = formInputPath(project);
    }

    logger.debug("Ready to run, inputPath = {}, outputPath = {}", inputPath, outputPath);
    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    SHMcloudLogging.init(false);

    if (Settings.getSettings().isHadoopDebug()) {
        if (new File(outputPath).exists()) {
            Util.deleteDirectory(new File(outputPath));
        }
    }

    SolrIndex.getInstance().init();

    boolean success = job.waitForCompletion(true);
    if (project.isEnvHadoop() && project.isFsS3()) {
        transferResultsToS3(outputPath);
    }

    SolrIndex.getInstance().destroy();

    return success ? 0 : 1;
}

From source file:com.shopping.hbase.mapreduce.Import.java

License:Apache License

/**
 * Job configuration.
 */
protected Job configureJob(Configuration conf, String inputPathName, String tableName) throws IOException {
    Path inputPath = new Path(inputPathName);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Importer.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    // job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Importer.class);
    // No reducers. Just write straight to table. Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.shopping.hbase.sample.mapreduce.SampleUploader.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(TextInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.shopzilla.hadoop.mapreduce.MiniMRClusterContextMRTest.java

License:Apache License

@Test
public void testWordCount() throws Exception {
    Path input = new Path("/user/test/keywords_data");
    Path output = new Path("/user/test/word_count");

    Job job = new Job(configuration);

    job.setJobName("Word Count Test");

    job.setMapperClass(WordCountMapper.class);
    job.setReducerClass(SumReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    job.setNumReduceTasks(1);
    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, output);

    assertTrue("All files from /data classpath directory should have been copied into HDFS",
            miniMRClusterContext.getFileSystem().exists(input));

    job.waitForCompletion(true);

    assertTrue("Output file should have been created", miniMRClusterContext.getFileSystem().exists(output));

    final LinkedList<String> expectedLines = new LinkedList<String>();
    expectedLines.add("goodbye\t1");
    expectedLines.add("hello\t1");
    expectedLines.add("world\t2");

    miniMRClusterContext.processData(output, new Function<String, Void>() {
        @Override
        public Void apply(String line) {
            assertEquals(expectedLines.pop(), line);
            return null;
        }
    });
    assertEquals(0, expectedLines.size());
}

From source file:com.skp.experiment.cf.als.hadoop.UploloadToHbaseTableJob.java

License:Apache License

/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
    Path inputPath = new Path(args[0]);
    String tableName = args[1];
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(Uploader.class);
    FileInputFormat.setInputPaths(job, inputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setMapperClass(Uploader.class);
    // No reducers.  Just write straight to table.  Call initTableReducerJob
    // because it sets up the TableOutputFormat.
    TableMapReduceUtil.initTableReducerJob(tableName, null, job);
    job.setNumReduceTasks(0);
    return job;
}

From source file:com.sohu.rdc.inf.cdn.offline.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length < 2) {
        System.err.println("Usage: wordcount <in> [<in>...] <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);

    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setInputFormatClass(LzoTextInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    for (int i = 0; i < otherArgs.length - 1; ++i) {
        FileInputFormat.addInputPath(job, new Path(otherArgs[i]));
    }
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[otherArgs.length - 1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}

From source file:com.soteradefense.dga.louvain.mapreduce.CommunityCompression.java

License:Apache License

public int run(String[] args) throws Exception {
    Configuration mrConf = this.getConf();
    for (java.util.Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
        mrConf.set(entry.getKey(), entry.getValue());
    }

    Job job = Job.getInstance(mrConf);
    job.setJarByClass(CommunityCompression.class);
    Path in = new Path(inputPath);
    Path out = new Path(outputPath);

    FileInputFormat.setInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, out);
    job.setJobName("CommunityCompression");

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LouvainVertexWritable.class);

    job.setMapperClass(CommunityCompression.Map.class);
    job.setReducerClass(CommunityCompression.Reduce.class);

    logger.debug("Running Mapreduce step with job configuration: {}", job);

    return job.waitForCompletion(false) ? 0 : 1;
}

From source file:com.soteradefense.dga.louvain.mapreduce.LouvainTableSynthesizer.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Job job = null;
    try {
        int iteration = 0;
        if (!basePath.endsWith("/"))
            basePath = basePath + "/";
        String inputPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        String joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
        String outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
        Configuration mrConf = this.getConf();
        job = Job.getInstance(mrConf);

        for (Map.Entry<String, String> entry : dgaConfiguration.getSystemProperties().entrySet()) {
            mrConf.set(entry.getKey(), entry.getValue());
        }

        FileSystem fs = FileSystem.get(job.getConfiguration());
        boolean nextFileExists = fs.exists(new Path(joinPath));
        while (nextFileExists) {
            System.out.println("Processing " + inputPath + " and " + joinPath);
            job = Job.getInstance(mrConf);
            job.setJobName("Louvain Table Synthesizer " + iteration);

            job.setJarByClass(LouvainTableSynthesizer.class);

            job.setMapperClass(LouvainTableSynthesizerMapper.class);
            job.setReducerClass(LouvainTableSynthesizerReducer.class);

            job.setInputFormatClass(TextInputFormat.class);
            job.setOutputFormatClass(TextOutputFormat.class);

            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(Text.class);

            //Reducer Output
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(NullWritable.class);

            //Add both input folders
            Path in = new Path(inputPath);
            Path joinIn = new Path(joinPath);
            Path out = new Path(outputPath);
            FileInputFormat.addInputPath(job, in);
            FileInputFormat.addInputPath(job, joinIn);
            FileOutputFormat.setOutputPath(job, out);

            job.waitForCompletion(true);
            //Set the new temp input path
            inputPath = outputPath;
            iteration++;
            outputPath = basePath + TABLE_BASE_NAME + FILE_NAME_SEPARATOR + iteration;
            joinPath = basePath + GIRAPH_FOLDER_BASE_NAME + FILE_NAME_SEPARATOR + (iteration + 1);
            nextFileExists = fs.exists(new Path(joinPath));
        }

    } catch (IOException e) {
        e.printStackTrace();
        return -1;
    } catch (InterruptedException e) {
        e.printStackTrace();
        return -1;
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        return -1;
    }
    return 0;
}