Example usage for the org.apache.hadoop.mapred.TextOutputFormat constructor

Introduction

This page collects example usages of the org.apache.hadoop.mapred.TextOutputFormat constructor, taken from the Stratosphere and Apache Flink Hadoop-compatibility sources.

Prototype

public TextOutputFormat()
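
For orientation, here is a minimal sketch of this constructor's original habitat: a plain Hadoop mapred job rather than a Flink/Stratosphere wrapper. The class name, job name, and argument handling are illustrative assumptions; only the TextOutputFormat calls mirror the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class TextOutputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(TextOutputFormatSketch.class);
        conf.setJobName("text-output-format-sketch"); // hypothetical job name

        // Pass records straight through: identity map, no reduce phase.
        conf.setNumReduceTasks(0);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        // TextOutputFormat writes one "key<separator>value" line per record;
        // the separator defaults to a tab.
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        TextOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

In plain MapReduce the framework instantiates the output format itself, which is why this sketch registers the class instead of calling the constructor; the Flink and Stratosphere wrappers below construct it directly.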

Usage

From source file: eu.stratosphere.hadoopcompatibility.example.WordCountWithHadoopOutputFormat.java

License: Apache License

@Override
public Plan getPlan(String... args) {
    // parse job parameters
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
            new TextInputFormat(), new JobConf(), "Input Lines");
    TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine()).input(source).name("Tokenize Lines").build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(mapper)
            .name("Count Words").build();
    HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat", reducer,
            Text.class, IntWritable.class);
    TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

    Plan plan = new Plan(out, "Hadoop OutputFormat Example");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}

From source file: eu.stratosphere.hadoopcompatibility.mapred.example.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Stratosphere job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
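
A note on the separator line above: mapred.textoutputformat.separator is the legacy Hadoop 1.x configuration key. Newer Hadoop releases read mapreduce.output.textoutputformat.separator, with the old key typically still honored through Hadoop's deprecation mapping. Left unset, the separator defaults to a tab character.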

From source file: org.apache.flink.hadoopcompatibility.mapred.example.HadoopMapredCompatWordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words = text
            .flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer())).groupBy(0)
            .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(),
                    new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}

From source file: org.apache.flink.hadoopcompatibility.mapred.example.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}

From source file: org.apache.flink.test.hadoop.mapred.WordCountMapredITCase.java

License: Apache License

@Override
protected void testProgram() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input = env.readHadoopFile(new TextInputFormat(), LongWritable.class,
            Text.class, textPath);

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
                    // group by the tuple field "0" and sum up tuple field "1"
                    .groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts
            .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

                @Override
                public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
                    return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
                }
            });

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}

From source file: org.apache.flink.test.hadoopcompatibility.mapred.WordCountMapredITCase.java

License: Apache License

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;

    if (isTestDeprecatedAPI) {
        input = env.createInput(
                HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    } else {
        input = env
                .createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
                    // group by the tuple field "0" and sum up tuple field "1"
                    .groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts
            .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

                @Override
                public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
                    return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
                }
            });

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
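
One caveat applies to all of the examples above: org.apache.hadoop.mapred.TextOutputFormat inherits FileOutputFormat's output check, so in plain Hadoop MapReduce a job refuses to start if the configured output path already exists. Whether the Flink/Stratosphere wrappers enforce the same check depends on the wrapper version, so clearing stale output first is a common precaution. A minimal sketch of such a guard, assuming jobConf and outputPath hold the same values the examples pass to setOutputPath (imports: org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path):

FileSystem fs = FileSystem.get(jobConf);
Path outDir = new Path(outputPath);
if (fs.exists(outDir)) {
    fs.delete(outDir, true); // recursively remove output left by a previous run
}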