Example usage for org.apache.hadoop.mapreduce.lib.input.TextInputFormat TextInputFormat()

List of usage examples for the org.apache.hadoop.mapreduce.lib.input.TextInputFormat constructor, TextInputFormat()

Introduction

On this page you can find example usages of the org.apache.hadoop.mapreduce.lib.input.TextInputFormat constructor, TextInputFormat().

Prototype

TextInputFormat()

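The no-argument constructor takes no configuration of its own; input paths, split sizes, and record delimiters all come from the job configuration. A minimal, hypothetical driver sketch, assuming the standard Hadoop MapReduce job-setup API (the job name and input path are placeholders):

Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "text input example"); // hypothetical job name
// TextInputFormat is the default input format, but it can be set explicitly.
job.setInputFormatClass(TextInputFormat.class);
// Each record delivered to the mapper is a (LongWritable byte offset, Text line) pair.
TextInputFormat.addInputPath(job, new Path("/tmp/input")); // hypothetical path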
Usage

From source file:ar.com.datatsunami.pig.FixedWidthLoader.java

License:Apache License

@SuppressWarnings("rawtypes")
@Override
public InputFormat getInputFormat() throws IOException {
    return new TextInputFormat();
}
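Several of the examples on this page return TextInputFormat from Pig's LoadFunc#getInputFormat() hook. As context, here is a minimal, hypothetical LoadFunc sketch showing how TextInputFormat typically plugs into the rest of the LoadFunc contract (setLocation, prepareToRead, getNext); the class name and tuple layout are illustrative only:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class LineLoader extends LoadFunc {
    private RecordReader<LongWritable, Text> reader;

    @Override
    public void setLocation(String location, Job job) throws IOException {
        // TextInputFormat is a FileInputFormat, so the location is simply an input path.
        FileInputFormat.setInputPaths(job, location);
    }

    @SuppressWarnings("rawtypes")
    @Override
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat();
    }

    @SuppressWarnings({ "rawtypes", "unchecked" })
    @Override
    public void prepareToRead(RecordReader reader, PigSplit split) {
        this.reader = reader;
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            if (!reader.nextKeyValue()) {
                return null; // end of input
            }
            // The LongWritable key (byte offset) is ignored; the Text value is one line.
            String line = reader.getCurrentValue().toString();
            return TupleFactory.getInstance().newTuple(line);
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}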

From source file:com.acme.io.JsonLoader.java

License:Apache License

/**
 * This will be called during planning on the front end. This is the
 * instance of InputFormat (rather than the class name) because the 
 * load function may need to instantiate the InputFormat in order 
 * to control how it is constructed.
 * @return the InputFormat associated with this loader.
 * @throws IOException if there is an exception during InputFormat 
 * construction
 */
@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
    // We will use TextInputFormat, the default Hadoop input format for
    // text.  It has a LongWritable key that we will ignore, and the value
    // is a Text (a string writable) that the JSON data is in.
    return new TextInputFormat();
}
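As the comment above notes, TextInputFormat hands each mapper a LongWritable byte offset as the key and a Text line as the value. A minimal, hypothetical mapper consuming such records (the JSON parsing itself is elided; the class name and output types are illustrative):

public static class JsonLineMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        // The key is only the byte offset of the line within the file, so it is ignored;
        // the Text value carries one line of JSON.
        context.write(new Text(line.toString().trim()), NullWritable.get());
    }
}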

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License:Apache License

@SuppressWarnings("rawtypes")
@Override
public InputFormat getInputFormat() throws IOException {
    AvroStorageLog.funcCall("getInputFormat");
    InputFormat result = null;
    if (inputAvroSchema != null) {
        result = new PigAvroInputFormat(inputAvroSchema, ignoreBadFiles, schemaToMergedSchemaMap,
                useMultipleSchemas);
    } else {
        result = new TextInputFormat();
    }
    return result;
}

From source file:com.mozilla.pig.load.DateRangeLoader.java

License:Apache License

@Override
@SuppressWarnings("rawtypes")
public InputFormat getInputFormat() throws IOException {
    return new TextInputFormat();
}

From source file:com.qq.pig.udf.CustomJsonLoader.java

License:Apache License

@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
    // We will use TextInputFormat, the default Hadoop input format for
    // text.  It has a LongWritable key that we will ignore, and the value
    // is a Text (a string writable) that the JSON data is in.
    return new TextInputFormat();
}

From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License:Apache License

private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader = textInputFormat.createRecordReader(fileSplit,
            taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
    }
    return batch;
}

From source file:crunch.MaxTemperature.java

License:Apache License

@Test
public void text() throws Exception {
    String input = "On the top of the Crumpetty Tree\n" + "The Quangle Wangle sat,\n"
            + "But his face you could not see,\n" + "On account of his Beaver Hat.";

    writeInput(input);

    TextInputFormat format = new TextInputFormat();
    format.configure(conf);
    InputSplit[] splits = format.getSplits(conf, 1);
    RecordReader<LongWritable, Text> recordReader = format.getRecordReader(splits[0], conf, Reporter.NULL);
    checkNextLine(recordReader, 0, "On the top of the Crumpetty Tree");
    checkNextLine(recordReader, 33, "The Quangle Wangle sat,");
    checkNextLine(recordReader, 57, "But his face you could not see,");
    checkNextLine(recordReader, 89, "On account of his Beaver Hat.");
}
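Note that this test uses the old org.apache.hadoop.mapred TextInputFormat API (configure, getSplits with a split-count hint, getRecordReader with a Reporter) rather than the org.apache.hadoop.mapreduce.lib.input class this page documents. A rough, hypothetical equivalent of the read loop against the new-API TextInputFormat (the input path and variable names are placeholders):

Job job = Job.getInstance(conf);
TextInputFormat.addInputPath(job, new Path("/tmp/quangle.txt")); // hypothetical input path
TextInputFormat format = new TextInputFormat();
List<InputSplit> splits = format.getSplits(job);
TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
RecordReader<LongWritable, Text> reader = format.createRecordReader(splits.get(0), context);
reader.initialize(splits.get(0), context);
while (reader.nextKeyValue()) {
    long offset = reader.getCurrentKey().get();        // byte offset of the line
    String line = reader.getCurrentValue().toString(); // the line itself
}
reader.close();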

From source file:crunch.MaxTemperature.java

License:Apache License

@Override
public InputFormat getInputFormat() {
    return new TextInputFormat();
}

From source file:eu.stratosphere.hadoopcompatibility.mapreduce.example.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Stratosphere job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
    // is being executed with both types (hadoop1 and hadoop2 profile)
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}

From source file:org.apache.flink.hadoopcompatibility.mapreduce.example.WordCount.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), job);
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " "); // set the value for both, since this test
    // is being executed with both types (hadoop1 and hadoop2 profile)
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
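The Tokenizer and HadoopDatatypeMapper referenced above are defined elsewhere in the WordCount example. For orientation, a minimal, hypothetical Tokenizer sketch, assuming Flink's FlatMapFunction interface (the exact base class differs across Stratosphere/Flink releases):

public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // f0 is TextInputFormat's byte offset and is ignored; f1 is the line of text.
        for (String word : value.f1.toString().toLowerCase().split("\\W+")) {
            if (word.length() > 0) {
                out.collect(new Tuple2<String, Integer>(word, 1));
            }
        }
    }
}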