List of usage examples for the org.apache.hadoop.mapred.TextOutputFormat constructor TextOutputFormat()
TextOutputFormat
From source file:eu.stratosphere.hadoopcompatibility.example.WordCountWithHadoopOutputFormat.java
License:Apache License
@Override public Plan getPlan(String... args) { // parse job parameters int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1); String dataInput = (args.length > 1 ? args[1] : ""); String output = (args.length > 2 ? args[2] : ""); HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>( new TextInputFormat(), new JobConf(), "Input Lines"); TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput)); MapOperator mapper = MapOperator.builder(new TokenizeLine()).input(source).name("Tokenize Lines").build(); ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(mapper) .name("Count Words").build(); HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>( new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat", reducer, Text.class, IntWritable.class); TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output)); Plan plan = new Plan(out, "Hadoop OutputFormat Example"); plan.setDefaultParallelism(numSubTasks); return plan;//from w ww .j a va 2s.c o m }
From source file:eu.stratosphere.hadoopcompatibility.mapred.example.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: WordCount <input path> <result path>"); return;/*from w ww . j a va2s. co m*/ } final String inputPath = args[0]; final String outputPath = args[1]; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setDegreeOfParallelism(1); // Set up the Hadoop Input Format HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>( new TextInputFormat(), LongWritable.class, Text.class, new JobConf()); TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath)); // Create a Stratosphere job with it DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat); // Tokenize the line and convert from Writable "Text" to String for better handling DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer()); // Sum up the words DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1); // Convert String back to Writable "Text" for use with Hadoop Output Format DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper()); // Set up Hadoop Output Format HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>( new TextOutputFormat<Text, IntWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath)); // Output & Execute hadoopResult.output(hadoopOutputFormat); env.execute("Word Count"); }
From source file:org.apache.flink.hadoopcompatibility.mapred.example.HadoopMapredCompatWordCount.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: WordCount <input path> <result path>"); return;//from w ww. j a v a2s.co m } final String inputPath = args[0]; final String outputPath = args[1]; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // Set up the Hadoop Input Format HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>( new TextInputFormat(), LongWritable.class, Text.class, new JobConf()); TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath)); // Create a Flink job with it DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat); DataSet<Tuple2<Text, LongWritable>> words = text .flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer())).groupBy(0) .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(), new Counter())); // Set up Hadoop Output Format HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>( new TextOutputFormat<Text, LongWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath)); // Output & Execute words.output(hadoopOutputFormat).setParallelism(1); env.execute("Hadoop Compat WordCount"); }
From source file:org.apache.flink.hadoopcompatibility.mapred.example.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { if (args.length < 2) { System.err.println("Usage: WordCount <input path> <result path>"); return;/* w w w .j a v a 2 s .co m*/ } final String inputPath = args[0]; final String outputPath = args[1]; final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); env.setDegreeOfParallelism(1); // Set up the Hadoop Input Format HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>( new TextInputFormat(), LongWritable.class, Text.class, new JobConf()); TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath)); // Create a Flink job with it DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat); // Tokenize the line and convert from Writable "Text" to String for better handling DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer()); // Sum up the words DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1); // Convert String back to Writable "Text" for use with Hadoop Output Format DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper()); // Set up Hadoop Output Format HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>( new TextOutputFormat<Text, IntWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath)); // Output & Execute hadoopResult.output(hadoopOutputFormat); env.execute("Word Count"); }
From source file:org.apache.flink.test.hadoop.mapred.WordCountMapredITCase.java
License:Apache License
@Override protected void testProgram() throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple2<LongWritable, Text>> input = env.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath); DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() { @Override/*ww w . j ava 2 s . c om*/ public String map(Tuple2<LongWritable, Text> value) throws Exception { return value.f1.toString(); } }); DataSet<Tuple2<String, Integer>> counts = // split up the lines in pairs (2-tuples) containing: (word,1) text.flatMap(new Tokenizer()) // group by the tuple field "0" and sum up tuple field "1" .groupBy(0).sum(1); DataSet<Tuple2<Text, LongWritable>> words = counts .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() { @Override public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception { return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1)); } }); // Set up Hadoop Output Format HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>( new TextOutputFormat<Text, LongWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath)); // Output & Execute words.output(hadoopOutputFormat); env.execute("Hadoop Compat WordCount"); }
From source file:org.apache.flink.test.hadoopcompatibility.mapred.WordCountMapredITCase.java
License:Apache License
private void internalRun(boolean isTestDeprecatedAPI) throws Exception { final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Tuple2<LongWritable, Text>> input; if (isTestDeprecatedAPI) { input = env.createInput(//from w ww . j ava 2s . co m HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath)); } else { input = env .createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath)); } DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() { @Override public String map(Tuple2<LongWritable, Text> value) throws Exception { return value.f1.toString(); } }); DataSet<Tuple2<String, Integer>> counts = // split up the lines in pairs (2-tuples) containing: (word,1) text.flatMap(new Tokenizer()) // group by the tuple field "0" and sum up tuple field "1" .groupBy(0).sum(1); DataSet<Tuple2<Text, LongWritable>> words = counts .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() { @Override public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception { return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1)); } }); // Set up Hadoop Output Format HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>( new TextOutputFormat<Text, LongWritable>(), new JobConf()); hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " "); TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath)); // Output & Execute words.output(hadoopOutputFormat); env.execute("Hadoop Compat WordCount"); }