Example usage for org.apache.hadoop.mapred FileInputFormat addInputPaths

Introduction

This page collects example usages of org.apache.hadoop.mapred.FileInputFormat#addInputPaths from open-source projects.

Prototype

public static void addInputPaths(JobConf conf, String commaSeparatedPaths) 

Document

Add the given comma-separated paths to the list of inputs for the map-reduce job.
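
The method splits its second argument on commas and appends each resulting path to the job's input list, so several directories can be registered in a single call. The sketch below is a minimal driver illustrating this; the input paths, output path, and class name are hypothetical placeholders, not values taken from the examples that follow.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;

public class AddInputPathsDemo {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(AddInputPathsDemo.class);
        conf.setJobName("addinputpaths-demo");

        // Hypothetical paths: both directories are registered as inputs
        // of the same job by a single addInputPaths call.
        FileInputFormat.addInputPaths(conf, "/data/logs/2015,/data/logs/2016");
        FileOutputFormat.setOutputPath(conf, new Path("/out/demo"));

        // No mapper or reducer is configured, so the job falls back to the
        // identity mapper and reducer and passes records through unchanged.
        JobClient.runJob(conf);
    }
}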

Usage

From source file: cn.edu.xmu.dm.mapreduce.MultiFileWordCount.java

License: Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    // set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(LongWritable.class);

    // use the defined mapper
    job.setMapperClass(MapClass.class);
    // use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
}

From source file: com.benchmark.mapred.MultiFileWordCount.java

License: Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 1;
    }

    JobConf job = new JobConf(getConf(), MultiFileWordCount.class);
    job.setJobName("MultiFileWordCount");

    //set the InputFormat of the job to our InputFormat
    job.setInputFormat(MyInputFormat.class);

    // the keys are words (strings)
    job.setOutputKeyClass(Text.class);
    // the values are counts (ints)
    job.setOutputValueClass(LongWritable.class);

    //use the defined mapper
    job.setMapperClass(MapClass.class);
    //use the WordCount Reducer
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(LongSumReducer.class);

    FileInputFormat.addInputPaths(job, args[0]);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    JobClient.runJob(job);

    return 0;
}

From source file: Corrector.Correction.java

License: Apache License

public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: Correction [0/7]");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(Correction.class);
    conf.setJobName("Correction " + inputPath + " " + Config.K);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    //conf.setBoolean("mapred.output.compress", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(CorrectionMapper.class);
    conf.setReducerClass(CorrectionReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file: Corrector.PCorrection.java

License: Apache License

public RunningJob run(String inputPath, String outputPath) throws Exception {
    sLogger.info("Tool name: PCorrection");
    sLogger.info(" - input: " + inputPath);
    sLogger.info(" - output: " + outputPath);

    JobConf conf = new JobConf(PCorrection.class);
    conf.setJobName("PCorrection " + inputPath + " " + Config.K);

    Config.initializeConfiguration(conf);

    FileInputFormat.addInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    //conf.setBoolean("mapred.output.compress", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);

    conf.setMapperClass(PCorrectionMapper.class);
    conf.setReducerClass(PCorrectionReducer.class);

    //delete the output directory if it exists already
    FileSystem.get(conf).delete(new Path(outputPath), true);

    return JobClient.runJob(conf);
}

From source file: fire.util.fileformats.iomapred.LoadBinaryToSequence.java

License: Apache License

public int run(String[] args) throws Exception {

    if (args.length < 2) {
        printUsage();
        return 2;
    }

    JobConf conf = new JobConf(LoadBinaryToSequence.class);
    conf.setJobName("loadbinarytosequence");

    //set the InputFormat of the job to our InputFormat
    conf.setInputFormat(CombineFileBinaryInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    // the keys are words (strings)
    conf.setOutputKeyClass(Text.class);
    // the values are images
    conf.setOutputValueClass(BytesWritable.class);

    //use the defined mapper
    conf.setMapperClass(MapClass.class);

    FileInputFormat.addInputPaths(conf, args[0]);
    FileOutputFormat.setOutputPath(conf, new Path(args[1]));

    JobClient.runJob(conf);

    return 0;
}

From source file: gobblin.data.management.copy.hive.HiveUtils.java

License: Apache License

/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
    JobConf jobConf = new JobConf(getHadoopConfiguration());

    Set<Path> paths = Sets.newHashSet();

    FileInputFormat.addInputPaths(jobConf, location.toString());
    InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
    for (InputSplit split : splits) {
        if (!(split instanceof FileSplit)) {
            throw new IOException("Not a file split. Found " + split.getClass().getName());
        }
        FileSplit fileSplit = (FileSplit) split;
        paths.add(fileSplit.getPath());
    }

    return paths;
}

From source file: io.bfscan.clueweb12.DumpWarcRecordsToPlainText.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);
    String output = cmdline.getOptionValue(OUTPUT_OPTION);

    LOG.info("Tool name: " + DumpWarcRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + input);
    LOG.info(" - output: " + output);

    JobConf conf = new JobConf(getConf(), DumpWarcRecordsToPlainText.class);
    conf.setJobName(DumpWarcRecordsToPlainText.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);
    FileOutputFormat.setOutputPath(conf, new Path(output));

    conf.setInputFormat(ClueWeb12InputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();

    LOG.info("Read " + numDocs + " docs.");

    return 0;
}

From source file: org.apache.avro.mapred.tether.TestWordCountTether.java

License: Apache License

@Test
@SuppressWarnings("deprecation")
public void testJob() throws Exception {

    System.out.println(System.getProperty("java.class.path"));
    JobConf job = new JobConf();
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path outputPath = new Path(dir + "/out");

    outputPath.getFileSystem(job).delete(outputPath);

    // create the input file
    WordCountUtil.writeLinesFile();

    File exec = new File(System.getProperty("java.home") + "/bin/java");

    //input path
    String in = dir + "/in";

    //create a string of the arguments
    List<String> execargs = new ArrayList<String>();
    execargs.add("-classpath");
    execargs.add(System.getProperty("java.class.path"));
    execargs.add("org.apache.avro.mapred.tether.WordCountTask");

    FileInputFormat.addInputPaths(job, in);
    FileOutputFormat.setOutputPath(job, outputPath);
    TetherJob.setExecutable(job, exec, execargs, false);

    Schema outscheme = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
    AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING));
    job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString());

    TetherJob.runJob(job);

    // validate the output
    DatumReader<Pair<Utf8, Long>> reader = new SpecificDatumReader<Pair<Utf8, Long>>();
    InputStream cin = new BufferedInputStream(new FileInputStream(WordCountUtil.COUNTS_FILE));
    DataFileStream<Pair<Utf8, Long>> counts = new DataFileStream<Pair<Utf8, Long>>(cin, reader);
    int numWords = 0;
    for (Pair<Utf8, Long> wc : counts) {
        assertEquals(wc.key().toString(), WordCountUtil.COUNTS.get(wc.key().toString()), wc.value());
        numWords++;
    }

    cin.close();
    assertEquals(WordCountUtil.COUNTS.size(), numWords);

}

From source file: org.apache.avro.tool.TetherTool.java

License: Apache License

@Override
public int run(InputStream ins, PrintStream outs, PrintStream err, List<String> args) throws Exception {

    OptionParser p = new OptionParser();
    OptionSpec<File> exec = p.accepts("program", "executable program, usually in HDFS").withRequiredArg()
            .ofType(File.class);
    OptionSpec<String> in = p.accepts("in", "comma-separated input paths").withRequiredArg()
            .ofType(String.class);
    OptionSpec<Path> out = p.accepts("out", "output directory").withRequiredArg().ofType(Path.class);
    OptionSpec<File> outSchema = p.accepts("outschema", "output schema file").withRequiredArg()
            .ofType(File.class);
    OptionSpec<File> mapOutSchema = p.accepts("outschemamap", "map output schema file, if different")
            .withOptionalArg().ofType(File.class);
    OptionSpec<Integer> reduces = p.accepts("reduces", "number of reduces").withOptionalArg()
            .ofType(Integer.class);

    JobConf job = new JobConf();

    try {
        OptionSet opts = p.parse(args.toArray(new String[0]));
        FileInputFormat.addInputPaths(job, in.value(opts));
        FileOutputFormat.setOutputPath(job, out.value(opts));
        TetherJob.setExecutable(job, exec.value(opts));
        job.set(AvroJob.OUTPUT_SCHEMA, Schema.parse(outSchema.value(opts)).toString());
        if (opts.hasArgument(mapOutSchema))
            job.set(AvroJob.MAP_OUTPUT_SCHEMA, Schema.parse(mapOutSchema.value(opts)).toString());
        if (opts.hasArgument(reduces))
            job.setNumReduceTasks(reduces.value(opts));
    } catch (Exception e) {
        p.printHelpOn(err);
        return -1;
    }

    TetherJob.runJob(job);
    return 0;
}

From source file: org.clueweb.clueweb09.app.CountWarcRecordsOld.java

License: Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName());
    LOG.info(" - input: " + input);

    JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class);
    conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input);

    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);

    conf.setInputFormat(ClueWeb09InputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int numDocs = (int) counters.findCounter(Records.PAGES).getCounter();

    LOG.info("Read " + numDocs + " docs.");

    return 0;
}