List of usage examples for org.apache.hadoop.mapred.FileInputFormat.addInputPaths
public static void addInputPaths(JobConf conf, String commaSeparatedPaths)
From source file:cn.edu.xmu.dm.mapreduce.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage();//w w w. java 2s. c o m return 1; } JobConf job = new JobConf(getConf(), MultiFileWordCount.class); job.setJobName("MultiFileWordCount"); // set the InputFormat of the job to our InputFormat job.setInputFormat(MyInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(LongWritable.class); // use the defined mapper job.setMapperClass(MapClass.class); // use the WordCount Reducer job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); JobClient.runJob(job); return 0; }
From source file:com.benchmark.mapred.MultiFileWordCount.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage();/*from w w w .j a v a 2 s.co m*/ return 1; } JobConf job = new JobConf(getConf(), MultiFileWordCount.class); job.setJobName("MultiFileWordCount"); //set the InputFormat of the job to our InputFormat job.setInputFormat(MyInputFormat.class); // the keys are words (strings) job.setOutputKeyClass(Text.class); // the values are counts (ints) job.setOutputValueClass(LongWritable.class); //use the defined mapper job.setMapperClass(MapClass.class); //use the WordCount Reducer job.setCombinerClass(LongSumReducer.class); job.setReducerClass(LongSumReducer.class); FileInputFormat.addInputPaths(job, args[0]); FileOutputFormat.setOutputPath(job, new Path(args[1])); JobClient.runJob(job); return 0; }
From source file:Corrector.Correction.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: Correction [0/7]"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(Correction.class); conf.setJobName("Correction " + inputPath + " " + Config.K); Config.initializeConfiguration(conf); FileInputFormat.addInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); //conf.setBoolean("mapred.output.compress", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(CorrectionMapper.class); conf.setReducerClass(CorrectionReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:Corrector.PCorrection.java
License:Apache License
public RunningJob run(String inputPath, String outputPath) throws Exception { sLogger.info("Tool name: PCorrection"); sLogger.info(" - input: " + inputPath); sLogger.info(" - output: " + outputPath); JobConf conf = new JobConf(PCorrection.class); conf.setJobName("PCorrection " + inputPath + " " + Config.K); Config.initializeConfiguration(conf); FileInputFormat.addInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, new Path(outputPath)); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); //conf.setBoolean("mapred.output.compress", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapperClass(PCorrectionMapper.class); conf.setReducerClass(PCorrectionReducer.class); //delete the output directory if it exists already FileSystem.get(conf).delete(new Path(outputPath), true); return JobClient.runJob(conf); }
From source file:fire.util.fileformats.iomapred.LoadBinaryToSequence.java
License:Apache License
public int run(String[] args) throws Exception { if (args.length < 2) { printUsage();/*from ww w. j ava2s .c om*/ return 2; } JobConf conf = new JobConf(LoadBinaryToSequence.class); conf.setJobName("loadbinarytosequence"); //set the InputFormat of the job to our InputFormat conf.setInputFormat(CombineFileBinaryInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); // the keys are words (strings) conf.setOutputKeyClass(Text.class); // the values are images conf.setOutputValueClass(BytesWritable.class); //use the defined mapper conf.setMapperClass(MapClass.class); FileInputFormat.addInputPaths(conf, args[0]); FileOutputFormat.setOutputPath(conf, new Path(args[1])); JobClient.runJob(conf); return 0; }
From source file:gobblin.data.management.copy.hive.HiveUtils.java
License:Apache License
/** * Get paths from a Hive location using the provided input format. *///from w w w. j a va 2 s. c om public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException { JobConf jobConf = new JobConf(getHadoopConfiguration()); Set<Path> paths = Sets.newHashSet(); FileInputFormat.addInputPaths(jobConf, location.toString()); InputSplit[] splits = inputFormat.getSplits(jobConf, 1000); for (InputSplit split : splits) { if (!(split instanceof FileSplit)) { throw new IOException("Not a file split. Found " + split.getClass().getName()); } FileSplit fileSplit = (FileSplit) split; paths.add(fileSplit.getPath()); } return paths; }
From source file:io.bfscan.clueweb12.DumpWarcRecordsToPlainText.java
License:Apache License
/** * Runs this tool./*www .j av a 2 s .c om*/ */ @SuppressWarnings("static-access") public int run(String[] args) throws Exception { Options options = new Options(); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION)); options.addOption( OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT_OPTION)); CommandLine cmdline; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); System.err.println("Error parsing command line: " + exp.getMessage()); return -1; } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(this.getClass().getName(), options); ToolRunner.printGenericCommandUsage(System.out); return -1; } String input = cmdline.getOptionValue(INPUT_OPTION); String output = cmdline.getOptionValue(OUTPUT_OPTION); LOG.info("Tool name: " + DumpWarcRecordsToPlainText.class.getSimpleName()); LOG.info(" - input: " + input); LOG.info(" - output: " + output); JobConf conf = new JobConf(getConf(), DumpWarcRecordsToPlainText.class); conf.setJobName(DumpWarcRecordsToPlainText.class.getSimpleName() + ":" + input); conf.setNumReduceTasks(0); FileInputFormat.addInputPaths(conf, input); FileOutputFormat.setOutputPath(conf, new Path(output)); conf.setInputFormat(ClueWeb12InputFormat.class); conf.setOutputFormat(TextOutputFormat.class); conf.setMapperClass(MyMapper.class); RunningJob job = JobClient.runJob(conf); Counters counters = job.getCounters(); int numDocs = (int) counters.findCounter(Records.PAGES).getCounter(); LOG.info("Read " + numDocs + " docs."); return 0; }
From source file:org.apache.avro.mapred.tether.TestWordCountTether.java
License:Apache License
@Test @SuppressWarnings("deprecation") public void testJob() throws Exception { System.out.println(System.getProperty("java.class.path")); JobConf job = new JobConf(); String dir = System.getProperty("test.dir", ".") + "/mapred"; Path outputPath = new Path(dir + "/out"); outputPath.getFileSystem(job).delete(outputPath); // create the input file WordCountUtil.writeLinesFile();/*from ww w. j a v a 2 s .c o m*/ File exec = new File(System.getProperty("java.home") + "/bin/java"); //input path String in = dir + "/in"; //create a string of the arguments List<String> execargs = new ArrayList<String>(); execargs.add("-classpath"); execargs.add(System.getProperty("java.class.path")); execargs.add("org.apache.avro.mapred.tether.WordCountTask"); FileInputFormat.addInputPaths(job, in); FileOutputFormat.setOutputPath(job, outputPath); TetherJob.setExecutable(job, exec, execargs, false); Schema outscheme = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema(); AvroJob.setInputSchema(job, Schema.create(Schema.Type.STRING)); job.set(AvroJob.OUTPUT_SCHEMA, outscheme.toString()); TetherJob.runJob(job); // validate the output DatumReader<Pair<Utf8, Long>> reader = new SpecificDatumReader<Pair<Utf8, Long>>(); InputStream cin = new BufferedInputStream(new FileInputStream(WordCountUtil.COUNTS_FILE)); DataFileStream<Pair<Utf8, Long>> counts = new DataFileStream<Pair<Utf8, Long>>(cin, reader); int numWords = 0; for (Pair<Utf8, Long> wc : counts) { assertEquals(wc.key().toString(), WordCountUtil.COUNTS.get(wc.key().toString()), wc.value()); numWords++; } cin.close(); assertEquals(WordCountUtil.COUNTS.size(), numWords); }
From source file:org.apache.avro.tool.TetherTool.java
License:Apache License
/**
 * Parses the tool's options, configures a tethered job from them and runs it.
 *
 * <p>Recognized options: {@code program}, {@code in}, {@code out},
 * {@code outschema}, {@code outschemamap} and {@code reduces}.
 *
 * @param ins  not used by this method
 * @param outs not used by this method
 * @param err  stream that receives the error report and the option help when
 *             the arguments cannot be turned into a job configuration
 * @param args command-line arguments
 * @return -1 if parsing the arguments or configuring the job fails,
 *         otherwise 0 once {@code TetherJob.runJob} returns
 * @throws Exception propagated from printing the help or from running the job
 */
@Override
public int run(InputStream ins, PrintStream outs, PrintStream err, List<String> args) throws Exception {
    OptionParser p = new OptionParser();
    OptionSpec<File> exec = p.accepts("program", "executable program, usually in HDFS").withRequiredArg()
            .ofType(File.class);
    OptionSpec<String> in = p.accepts("in", "comma-separated input paths").withRequiredArg()
            .ofType(String.class);
    OptionSpec<Path> out = p.accepts("out", "output directory").withRequiredArg().ofType(Path.class);
    OptionSpec<File> outSchema = p.accepts("outschema", "output schema file").withRequiredArg()
            .ofType(File.class);
    OptionSpec<File> mapOutSchema = p.accepts("outschemamap", "map output schema file, if different")
            .withOptionalArg().ofType(File.class);
    OptionSpec<Integer> reduces = p.accepts("reduces", "number of reduces").withOptionalArg()
            .ofType(Integer.class);

    JobConf job = new JobConf();

    try {
        OptionSet opts = p.parse(args.toArray(new String[0]));

        FileInputFormat.addInputPaths(job, in.value(opts));
        FileOutputFormat.setOutputPath(job, out.value(opts));
        TetherJob.setExecutable(job, exec.value(opts));
        job.set(AvroJob.OUTPUT_SCHEMA, Schema.parse(outSchema.value(opts)).toString());
        if (opts.hasArgument(mapOutSchema)) {
            job.set(AvroJob.MAP_OUTPUT_SCHEMA, Schema.parse(mapOutSchema.value(opts)).toString());
        }
        if (opts.hasArgument(reduces)) {
            job.setNumReduceTasks(reduces.value(opts));
        }
    } catch (Exception e) {
        // Report what went wrong before the usage text; printing only the help
        // hides the cause (bad option, unreadable schema file, ...).
        err.println("Error: " + e);
        p.printHelpOn(err);
        return -1;
    }

    TetherJob.runJob(job);
    return 0;
}
From source file:org.clueweb.clueweb09.app.CountWarcRecordsOld.java
License:Apache License
/**
 * Runs this tool.
 *
 * <p>Parses the input path option, then runs a map-only job (zero reduce
 * tasks) that reads through {@code ClueWeb09InputFormat} and discards its
 * output via {@code NullOutputFormat}. The value of the {@code Records.PAGES}
 * counter is logged once the job has finished.
 *
 * @param args command-line arguments; the input option is required
 * @return -1 if the command line cannot be parsed or the input option is
 *         missing (usage is printed), otherwise 0
 * @throws Exception propagated from job configuration or execution
 */
@SuppressWarnings("static-access")
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String input = cmdline.getOptionValue(INPUT_OPTION);

    LOG.info("Tool name: " + CountWarcRecordsOld.class.getSimpleName());
    LOG.info(" - input: " + input);

    JobConf conf = new JobConf(getConf(), CountWarcRecordsOld.class);
    conf.setJobName(CountWarcRecordsOld.class.getSimpleName() + ":" + input);

    // Map-only job: no reduce phase.
    conf.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(conf, input);

    conf.setInputFormat(ClueWeb09InputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();

    // Keep the counter as a long: narrowing it to int would silently overflow
    // once more than Integer.MAX_VALUE records have been counted.
    long numDocs = counters.findCounter(Records.PAGES).getCounter();
    LOG.info("Read " + numDocs + " docs.");

    return 0;
}