Example usage for org.apache.hadoop.mapreduce Job setInputFormatClass

Introduction

This page collects example usages of org.apache.hadoop.mapreduce Job setInputFormatClass.

Prototype

public void setInputFormatClass(Class<? extends InputFormat> cls) throws IllegalStateException 

Document

Set the InputFormat for the job.
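
As a quick orientation before the project listings below, here is a minimal, self-contained sketch of the call in a map-only job. It is an illustrative example, not taken from any of the sources on this page; the class name and the /tmp input and output paths are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SetInputFormatSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "set-input-format-sketch");
        job.setJarByClass(SetInputFormatSketch.class);

        // The call this page documents: select the InputFormat implementation for the job.
        // It throws IllegalStateException if the job has already been submitted.
        job.setInputFormatClass(TextInputFormat.class);

        // Identity map-only job: TextInputFormat supplies (LongWritable offset, Text line) pairs,
        // and the default mapper passes them straight through to the output.
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);

        // Placeholder paths; adjust for your environment.
        FileInputFormat.addInputPath(job, new Path("/tmp/setinputformat-in"));
        FileOutputFormat.setOutputPath(job, new Path("/tmp/setinputformat-out"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

The real-world examples that follow swap TextInputFormat for project-specific formats such as AccumuloInputFormat, AerospikeInputFormat, or CombineAvroKeyValueInputFormat.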

Usage

From source file:com.accumulobook.advanced.mapreduce.MapReduceFilesExample.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(this.getConf());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCount.WordCountMapper.class);
    job.setCombinerClass(WordCount.WordCountCombiner.class);
    job.setReducerClass(WordCount.WordCountReducer.class);

    // clone the articles table
    ZooKeeperInstance inst = new ZooKeeperInstance(args[0], args[1]);
    Connector conn = inst.getConnector(args[2], new PasswordToken(args[3]));

    conn.tableOperations().clone(WikipediaConstants.ARTICLES_TABLE, WikipediaConstants.ARTICLES_TABLE_CLONE,
            true, Collections.EMPTY_MAP, Collections.EMPTY_SET);

    // take cloned table offline, waiting until the operation is complete
    boolean wait = true;
    conn.tableOperations().offline(WikipediaConstants.ARTICLES_TABLE_CLONE, wait);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);
    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE_CLONE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // configure to use underlying RFiles
    AccumuloInputFormat.setOfflineTableScan(job, true);

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig bwConfig = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, bwConfig);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.waitForCompletion(true);
    //job.submit();

    return 0;
}

From source file:com.accumulobook.advanced.mapreduce.WordCount.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    Job job = Job.getInstance(new Configuration());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    // input
    job.setInputFormatClass(AccumuloInputFormat.class);

    ClientConfiguration zkiConfig = new ClientConfiguration().withInstance(args[0]).withZkHosts(args[1]);

    AccumuloInputFormat.setInputTableName(job, WikipediaConstants.ARTICLES_TABLE);
    List<Pair<Text, Text>> columns = new ArrayList<>();
    columns.add(new Pair(WikipediaConstants.CONTENTS_FAMILY_TEXT, new Text("")));

    AccumuloInputFormat.fetchColumns(job, columns);
    AccumuloInputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloInputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));

    // output
    job.setOutputFormatClass(AccumuloOutputFormat.class);

    BatchWriterConfig config = new BatchWriterConfig();

    AccumuloOutputFormat.setBatchWriterOptions(job, config);
    AccumuloOutputFormat.setZooKeeperInstance(job, zkiConfig);
    AccumuloOutputFormat.setConnectorInfo(job, args[2], new PasswordToken(args[3]));
    AccumuloOutputFormat.setDefaultTableName(job, WikipediaConstants.WORD_COUNT_TABLE);
    AccumuloOutputFormat.setCreateTables(job, true);

    job.setJarByClass(WordCount.class);

    job.submit();
    return 0;
}

From source file:com.aerospike.hadoop.examples.aggregateintinput.AggregateIntInput.java

License:Apache License

public int run(final String[] args) throws Exception {
    final Configuration conf = getConf();

    @SuppressWarnings("deprecation")
    final Job job = new Job(conf, "AerospikeAggregateIntInput");

    log.info("run starting on bin " + binName);

    job.setJarByClass(AggregateIntInput.class);
    job.setInputFormatClass(AerospikeInputFormat.class);
    job.setMapperClass(Map.class);
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(LongWritable.class);
    // job.setCombinerClass(Reduce.class); // no combiner
    job.setReducerClass(Reduce.class);
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);

    FileOutputFormat.setOutputPath(job, new Path(args[0]));

    int status = job.waitForCompletion(true) ? 0 : 1;
    log.info("run finished, status=" + status);
    return status;
}

From source file:com.ailk.oci.ocnosql.tools.load.csvbulkload.CsvBulkLoadTool.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    HBaseConfiguration.addHbaseResources(getConf());
    Configuration conf = getConf();
    String quorum = conf.get("hbase.zookeeper.quorum");
    String clientPort = conf.get("hbase.zookeeper.property.clientPort");
    LOG.info("hbase.zookeeper.quorum=" + quorum);
    LOG.info("hbase.zookeeper.property.clientPort=" + clientPort);
    LOG.info("phoenix.query.dateFormat=" + conf.get("phoenix.query.dateFormat"));

    CommandLine cmdLine = null;
    try {
        cmdLine = parseOptions(args);
        LOG.info("JdbcUrl=" + getJdbcUrl(quorum + ":" + clientPort));
    } catch (IllegalStateException e) {
        printHelpAndExit(e.getMessage(), getOptions());
    }
    Class.forName(DriverManager.class.getName());
    Connection conn = DriverManager.getConnection(getJdbcUrl(quorum + ":" + clientPort));
    String tableName = cmdLine.getOptionValue(TABLE_NAME_OPT.getOpt());
    String schemaName = cmdLine.getOptionValue(SCHEMA_NAME_OPT.getOpt());
    String qualifiedTableName = getQualifiedTableName(schemaName, tableName);
    List<ColumnInfo> importColumns = buildImportColumns(conn, cmdLine, qualifiedTableName);

    LOG.info("tableName=" + tableName);
    LOG.info("schemaName=" + schemaName);
    LOG.info("qualifiedTableName=" + qualifiedTableName);

    configureOptions(cmdLine, importColumns, getConf());

    try {
        validateTable(conn, schemaName, tableName);
    } finally {
        conn.close();
    }

    Path inputPath = new Path(cmdLine.getOptionValue(INPUT_PATH_OPT.getOpt()));
    Path outputPath = null;
    if (cmdLine.hasOption(OUTPUT_PATH_OPT.getOpt())) {
        outputPath = new Path(cmdLine.getOptionValue(OUTPUT_PATH_OPT.getOpt()));
    } else {
        outputPath = new Path("/tmp/" + UUID.randomUUID());
    }
    LOG.info("Configuring HFile output path to {}", outputPath);

    Job job = new Job(getConf(),
            "Phoenix MapReduce import for " + getConf().get(PhoenixCsvToKeyValueMapper.TABLE_NAME_CONFKEY));

    // Allow overriding the job jar setting by using a -D system property at startup
    if (job.getJar() == null) {
        job.setJarByClass(PhoenixCsvToKeyValueMapper.class);
    }
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, inputPath);

    FileSystem.get(getConf());
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setMapperClass(PhoenixCsvToKeyValueMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);

    HTable htable = new HTable(getConf(), qualifiedTableName);

    // Auto configure partitioner and reducer according to the Main Data table
    HFileOutputFormat.configureIncrementalLoad(job, htable);

    LOG.info("Running MapReduce import job from {} to {}", inputPath, outputPath);
    boolean success = job.waitForCompletion(true);
    if (!success) {
        LOG.error("Import job failed, check JobTracker for details");
        return 1;
    }

    LOG.info("Loading HFiles from {}", outputPath);
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(getConf());
    loader.doBulkLoad(outputPath, htable);
    htable.close();

    LOG.info("Incremental load complete");

    LOG.info("Removing output directory {}", outputPath);
    if (!FileSystem.get(getConf()).delete(outputPath, true)) {
        LOG.error("Removing output directory {} failed", outputPath);
    }

    return 0;
}

From source file:com.ailk.oci.ocnosql.tools.load.mutiple.MutipleColumnImportTsv.java

License:Apache License

/**
 * Sets up the actual job.
 *
 * @param conf  The current configuration.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String tableName, String inputPath,
        String tmpOutputPath) throws IOException, ClassNotFoundException {

    // Support non-XML supported characters
    // by re-encoding the passed separator as a Base64 string.
    String actualSeparator = conf.get(CommonConstants.SEPARATOR);
    if (actualSeparator != null) {
        conf.set(CommonConstants.SEPARATOR, Base64.encodeBytes(actualSeparator.getBytes()));
    }
    String tableNameConf = conf.get(CommonConstants.TABLE_NAME);
    if (tableNameConf == null) {
        conf.set(CommonConstants.TABLE_NAME, tableName);
    }

    // See if a non-default Mapper was set
    String mapperClassName = conf.get(MAPPER_CONF_KEY);
    Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

    Path inputDir = new Path(inputPath);
    Job job = new Job(conf, NAME + "_" + tableName);
    job.setJarByClass(MutipleColumnImportTsv.class);
    FileInputFormat.setInputPaths(job, inputDir);

    // The InputFormat can be overridden with -Dimporttsv.inputFormat; it defaults to TextInputFormat.
    String inputFmtName = conf.get(CommonConstants.INPUTFORMAT,
            "org.apache.hadoop.mapreduce.lib.input.TextInputFormat");
    LOG.info(CommonConstants.INPUTFORMAT + " is " + inputFmtName);
    Class<? extends InputFormat> inputFmtClass = Class.forName(inputFmtName).asSubclass(InputFormat.class);
    job.setInputFormatClass(inputFmtClass);
    job.setMapperClass(mapperClass);

    String hfileOutPath = tmpOutputPath;
    if (hfileOutPath != null) {
        if (!doesTableExist(tableName)) {
            createTable(conf, tableName);
        }
        HTable table = new HTable(conf, tableName);
        //      job.setReducerClass(MutipleColumnReducer.class);
        Path outputDir = new Path(hfileOutPath);
        FileOutputFormat.setOutputPath(job, outputDir);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(Put.class);
        HFileOutputFormat.configureIncrementalLoad(job, table);
    } else {
        // No reducers.  Just write straight to table.  Call initTableReducerJob
        // to set up the TableOutputFormat.
        TableMapReduceUtil.initTableReducerJob(tableName, null, job);
        job.setNumReduceTasks(0);
    }

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
            com.google.common.base.Function.class /* Guava used by TsvParser */);
    return job;
}

From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java

License:Apache License

/**
 * Sets up the actual importtsv MapReduce job.
 *
 * @param conf  The current configuration.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String tableName, String inputPath,
        String tmpOutputPath) throws IOException, ClassNotFoundException {

    // Support non-XML supported characters
    // by re-encoding the passed separator as a Base64 string.
    String actualSeparator = conf.get(CommonConstants.SEPARATOR);
    if (actualSeparator != null) {
        conf.set(CommonConstants.SEPARATOR, Base64.encodeBytes(actualSeparator.getBytes()));
    }

    // See if a non-default Mapper was set; the default is SingleColumnImporterMapper.
    String mapperClassName = conf.get(MAPPER_CONF_KEY);
    Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

    Path inputDir = new Path(inputPath);
    // Create the job.
    Job job = new Job(conf, NAME + "_" + tableName);
    // Set the jar by finding where a given class came from.
    job.setJarByClass(SingleColumnImportTsv.class);
    FileInputFormat.setInputPaths(job, inputDir);

    // The InputFormat can be overridden with -Dimporttsv.inputFormat; it defaults to TextInputFormat.
    String inputFmtName = conf.get(CommonConstants.INPUTFORMAT,
            "org.apache.hadoop.mapreduce.lib.input.TextInputFormat");
    LOG.info(CommonConstants.INPUTFORMAT + " is " + inputFmtName);
    Class<? extends InputFormat> inputFmtClass = Class.forName(inputFmtName).asSubclass(InputFormat.class);
    job.setInputFormatClass(inputFmtClass);
    job.setMapperClass(mapperClass);

    String hfileOutPath = tmpOutputPath;
    if (hfileOutPath != null) {
        // Create the table if it does not exist.
        if (!doesTableExist(tableName)) {
            createTable(conf, tableName);
        }
        HTable table = new HTable(conf, tableName);
        // Set the reducer.
        job.setReducerClass(SingleColumnReducer.class);

        Path outputDir = new Path(hfileOutPath);
        FileOutputFormat.setOutputPath(job, outputDir);
        job.setMapOutputKeyClass(ImmutableBytesWritable.class);
        job.setMapOutputValueClass(TextArrayWritable.class);
        // Configure the partitioner, output format, and reducers for the incremental load.
        configureIncrementalLoad(job, table);

    } else { // No HFile output path: write Puts directly to the table.
        // No reducers.  Just write straight to table.  Call initTableReducerJob
        // to set up the TableOutputFormat.
        TableMapReduceUtil.initTableReducerJob(tableName, null, job);
        job.setNumReduceTasks(0);
    }

    TableMapReduceUtil.addDependencyJars(job);
    TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
            com.google.common.base.Function.class /* Guava used by TsvParser */);
    return job;
}

From source file:com.alectenharmsel.research.LcCounters.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 2) {
        System.err.println("Usage: LineCounter <input> <output>");
        System.exit(-1);
    }

    Job job = new Job(getConf(), "LineCount");
    job.setJarByClass(LineCount.class);

    job.setInputFormatClass(WholeBlockInputFormat.class);

    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(LineCountMapper.class);
    job.setReducerClass(LineCountReducer.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    Configuration check = job.getConfiguration();
    boolean success = job.waitForCompletion(true);

    //Get the counter here, output to a file called total in the dir
    Counters counters = job.getCounters();

    //Throw it in the file
    Path outPath = new Path(args[1]);
    FileSystem fs = outPath.getFileSystem(check);
    OutputStream out = fs.create(new Path(outPath, "total"));
    String total = counters.findCounter(LcCounters.NUM_LINES).getValue() + "\n";
    out.write(total.getBytes());
    out.close();
    return success ? 0 : 1;
}

From source file:com.alexholmes.hadooputils.combine.avro.mapreduce.CombineAvroKeyValueInputFormatTest.java

License:Apache License

@Test
public void testKeyValueInput() throws ClassNotFoundException, IOException, InterruptedException {
    // Create a test input file.
    File inputFile = createInputFile();

    // Configure the job input.
    Job job = new Job();
    FileInputFormat.setInputPaths(job, new Path(inputFile.getAbsolutePath()));
    job.setInputFormatClass(CombineAvroKeyValueInputFormat.class);
    AvroJob.setInputKeySchema(job, Schema.create(Schema.Type.INT));
    AvroJob.setInputValueSchema(job, Schema.create(Schema.Type.STRING));

    // Configure a mapper.
    job.setMapperClass(IndexMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    // Configure a reducer.
    job.setReducerClass(IndexReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(AvroValue.class);
    AvroJob.setOutputValueSchema(job, Schema.createArray(Schema.create(Schema.Type.INT)));

    // Configure the output format.
    job.setOutputFormatClass(AvroKeyValueOutputFormat.class);
    Path outputPath = new Path(mTempDir.getRoot().getPath(), "out-index");
    FileOutputFormat.setOutputPath(job, outputPath);

    // Run the job.
    assertTrue(job.waitForCompletion(true));

    // Verify that the output Avro container file has the expected data.
    File avroFile = new File(outputPath.toString(), "part-r-00000.avro");
    DatumReader<GenericRecord> datumReader = new SpecificDatumReader<GenericRecord>(AvroKeyValue
            .getSchema(Schema.create(Schema.Type.STRING), Schema.createArray(Schema.create(Schema.Type.INT))));
    DataFileReader<GenericRecord> avroFileReader = new DataFileReader<GenericRecord>(avroFile, datumReader);
    assertTrue(avroFileReader.hasNext());

    AvroKeyValue<CharSequence, List<Integer>> appleRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(appleRecord.get());
    assertEquals("apple", appleRecord.getKey().toString());
    List<Integer> appleDocs = appleRecord.getValue();
    assertEquals(3, appleDocs.size());
    assertTrue(appleDocs.contains(1));
    assertTrue(appleDocs.contains(2));
    assertTrue(appleDocs.contains(3));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> bananaRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertNotNull(bananaRecord.get());
    assertEquals("banana", bananaRecord.getKey().toString());
    List<Integer> bananaDocs = bananaRecord.getValue();
    assertEquals(2, bananaDocs.size());
    assertTrue(bananaDocs.contains(1));
    assertTrue(bananaDocs.contains(2));

    assertTrue(avroFileReader.hasNext());
    AvroKeyValue<CharSequence, List<Integer>> carrotRecord = new AvroKeyValue<CharSequence, List<Integer>>(
            avroFileReader.next());
    assertEquals("carrot", carrotRecord.getKey().toString());
    List<Integer> carrotDocs = carrotRecord.getValue();
    assertEquals(1, carrotDocs.size());
    assertTrue(carrotDocs.contains(1));

    assertFalse(avroFileReader.hasNext());
    avroFileReader.close();
}

From source file:com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileJob.java

License:Apache License

/**
 * The driver for the MapReduce job.
 *
 * @param conf           configuration
 * @param inputDirAsString  comma-separated list of input directories
 * @param outputDirAsString output directory
 * @return true if the job completed successfully
 * @throws java.io.IOException         if something went wrong
 * @throws java.net.URISyntaxException if a URI wasn't correctly formed
 */
public boolean runJob(final Configuration conf, final String inputDirAsString, final String outputDirAsString)
        throws IOException, URISyntaxException, ClassNotFoundException, InterruptedException {

    Job job = new Job(conf);

    job.setJarByClass(CombineSequenceFileJob.class);
    job.setJobName("seqfilecombiner");

    job.setNumReduceTasks(0);

    //        job.setMapperClass(IdentityMapper.class);

    job.setInputFormatClass(CombineSequenceFileInputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);

    FileInputFormat.setInputPaths(job, inputDirAsString);
    FileOutputFormat.setOutputPath(job, new Path(outputDirAsString));

    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    boolean jobResult = job.waitForCompletion(true);
    Date endTime = new Date();
    System.out.println("Job ended: " + endTime);
    System.out.println("The job took "
            + TimeUnit.MILLISECONDS.toSeconds(endTime.getTime() - startTime.getTime()) + " seconds.");

    return jobResult;
}

From source file:com.alexholmes.json.mapreduce.ExampleJob.java

License:Apache License

/**
 * The MapReduce driver - setup and launch the job.
 *
 * @param args the command-line arguments
 * @return the process exit code
 * @throws Exception if something goes wrong
 */
public int run(final String[] args) throws Exception {

    String input = args[0];
    String output = args[1];

    Configuration conf = super.getConf();

    writeInput(conf, new Path(input));

    Job job = new Job(conf);
    job.setJarByClass(ExampleJob.class);
    job.setMapperClass(Map.class);

    job.setNumReduceTasks(0);

    Path outputPath = new Path(output);

    FileInputFormat.setInputPaths(job, input);
    FileOutputFormat.setOutputPath(job, outputPath);

    // use the JSON input format
    job.setInputFormatClass(MultiLineJsonInputFormat.class);

    // specify the JSON attribute name which is used to determine which
    // JSON elements are supplied to the mapper
    MultiLineJsonInputFormat.setInputJsonMember(job, "colorName");

    if (job.waitForCompletion(true)) {
        return 0;
    }
    return 1;
}