Example usage for the org.apache.hadoop.mapred.TextOutputFormat constructor

Introduction

This page collects example usages of the org.apache.hadoop.mapred.TextOutputFormat constructor, taken from the Stratosphere and Apache Flink Hadoop-compatibility sources.

Prototype

public TextOutputFormat()
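
For orientation, here is a minimal sketch of this constructor's original habitat: a plain Hadoop mapred job rather than a Flink/Stratosphere wrapper. The class name, job name, and argument handling are illustrative assumptions; only the TextOutputFormat calls mirror the examples below.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class TextOutputFormatSketch {
    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(TextOutputFormatSketch.class);
        conf.setJobName("text-output-format-sketch"); // hypothetical job name

        // Pass records straight through: identity map, no reduce phase.
        conf.setNumReduceTasks(0);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        conf.setInputFormat(TextInputFormat.class);
        // TextOutputFormat writes one "key<separator>value" line per record;
        // the separator defaults to a tab.
        conf.setOutputFormat(TextOutputFormat.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        TextOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}

In plain MapReduce the framework instantiates the output format itself, which is why this sketch registers the class instead of calling the constructor; the Flink and Stratosphere wrappers below construct it directly.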

Usage

From source file: eu.stratosphere.hadoopcompatibility.example.WordCountWithHadoopOutputFormat.java

License: Apache License

@Override
public Plan getPlan(String... args) {
    // parse job parameters
    int numSubTasks = (args.length > 0 ? Integer.parseInt(args[0]) : 1);
    String dataInput = (args.length > 1 ? args[1] : "");
    String output = (args.length > 2 ? args[2] : "");

    HadoopDataSource<LongWritable, Text> source = new HadoopDataSource<LongWritable, Text>(
            new TextInputFormat(), new JobConf(), "Input Lines");
    TextInputFormat.addInputPath(source.getJobConf(), new Path(dataInput));

    MapOperator mapper = MapOperator.builder(new TokenizeLine()).input(source).name("Tokenize Lines").build();
    ReduceOperator reducer = ReduceOperator.builder(CountWords.class, StringValue.class, 0).input(mapper)
            .name("Count Words").build();
    HadoopDataSink<Text, IntWritable> out = new HadoopDataSink<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf(), "Hadoop TextOutputFormat", reducer,
            Text.class, IntWritable.class);
    TextOutputFormat.setOutputPath(out.getJobConf(), new Path(output));

    Plan plan = new Plan(out, "Hadoop OutputFormat Example");
    plan.setDefaultParallelism(numSubTasks);
    return plan;
}

From source file: eu.stratosphere.hadoopcompatibility.mapred.example.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Stratosphere job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
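
A note on the separator line above: mapred.textoutputformat.separator is the legacy Hadoop 1.x configuration key. Newer Hadoop releases read mapreduce.output.textoutputformat.separator, with the old key typically still honored through Hadoop's deprecation mapping. Left unset, the separator defaults to a tab character.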

From source file: org.apache.flink.hadoopcompatibility.mapred.example.HadoopMapredCompatWordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    DataSet<Tuple2<Text, LongWritable>> words = text
            .flatMap(new HadoopMapFunction<LongWritable, Text, Text, LongWritable>(new Tokenizer())).groupBy(0)
            .reduceGroup(new HadoopReduceCombineFunction<Text, LongWritable, Text, LongWritable>(new Counter(),
                    new Counter()));

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    words.output(hadoopOutputFormat).setParallelism(1);
    env.execute("Hadoop Compat WordCount");
}

From source file: org.apache.flink.hadoopcompatibility.mapred.example.WordCount.java

License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, new JobConf());
    TextInputFormat.addInputPath(hadoopInputFormat.getJobConf(), new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}

From source file: org.apache.flink.test.hadoop.mapred.WordCountMapredITCase.java

License: Apache License

@Override
protected void testProgram() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input = env.readHadoopFile(new TextInputFormat(), LongWritable.class,
            Text.class, textPath);

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
                    // group by the tuple field "0" and sum up tuple field "1"
                    .groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts
            .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

                @Override
                public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
                    return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
                }
            });

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}

From source file: org.apache.flink.test.hadoopcompatibility.mapred.WordCountMapredITCase.java

License: Apache License

private void internalRun(boolean isTestDeprecatedAPI) throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Tuple2<LongWritable, Text>> input;

    if (isTestDeprecatedAPI) {
        input = env.createInput(
                HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    } else {
        input = env
                .createInput(readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
    }

    DataSet<String> text = input.map(new MapFunction<Tuple2<LongWritable, Text>, String>() {
        @Override
        public String map(Tuple2<LongWritable, Text> value) throws Exception {
            return value.f1.toString();
        }
    });

    DataSet<Tuple2<String, Integer>> counts =
            // split up the lines in pairs (2-tuples) containing: (word,1)
            text.flatMap(new Tokenizer())
                    // group by the tuple field "0" and sum up tuple field "1"
                    .groupBy(0).sum(1);

    DataSet<Tuple2<Text, LongWritable>> words = counts
            .map(new MapFunction<Tuple2<String, Integer>, Tuple2<Text, LongWritable>>() {

                @Override
                public Tuple2<Text, LongWritable> map(Tuple2<String, Integer> value) throws Exception {
                    return new Tuple2<Text, LongWritable>(new Text(value.f0), new LongWritable(value.f1));
                }
            });

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, LongWritable>(
            new TextOutputFormat<Text, LongWritable>(), new JobConf());
    hadoopOutputFormat.getJobConf().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(hadoopOutputFormat.getJobConf(), new Path(resultPath));

    // Output & Execute
    words.output(hadoopOutputFormat);
    env.execute("Hadoop Compat WordCount");
}
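
One caveat applies to all of the examples above: org.apache.hadoop.mapred.TextOutputFormat inherits FileOutputFormat's output check, so in plain Hadoop MapReduce a job refuses to start if the configured output path already exists. Whether the Flink/Stratosphere wrappers enforce the same check depends on the wrapper version, so clearing stale output first is a common precaution. A minimal sketch of such a guard, assuming jobConf and outputPath hold the same values the examples pass to setOutputPath (imports: org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path):

FileSystem fs = FileSystem.get(jobConf);
Path outDir = new Path(outputPath);
if (fs.exists(outDir)) {
    fs.delete(outDir, true); // recursively remove output left by a previous run
}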