List of usage examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat
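As background for the examples below: TextInputFormat is Hadoop's default input format for plain text files. Records are lines; the key is the byte offset of the line in the file (a LongWritable) and the value is the line's contents (a Text). A minimal sketch of a map-only job wired to it follows; the class names and the echo mapper are illustrative, not taken from any example on this page.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class TextInputDriver {

    // Mapper that echoes each line's byte offset and content unchanged.
    public static class EchoMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws java.io.IOException, InterruptedException {
            context.write(key, value); // key = byte offset, value = line
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "text-input-demo");
        job.setJarByClass(TextInputDriver.class);
        job.setInputFormatClass(TextInputFormat.class); // the default; set here for clarity
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setMapperClass(EchoMapper.class);
        job.setNumReduceTasks(0); // map-only job
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}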
From source file: ar.com.datatsunami.pig.FixedWidthLoader.java
License: Apache License

@SuppressWarnings("rawtypes")
@Override
public InputFormat getInputFormat() throws IOException {
    return new TextInputFormat();
}
From source file: com.acme.io.JsonLoader.java
License: Apache License

/**
 * This will be called during planning on the front end. This is the
 * instance of InputFormat (rather than the class name) because the
 * load function may need to instantiate the InputFormat in order
 * to control how it is constructed.
 * @return the InputFormat associated with this loader.
 * @throws IOException if there is an exception during InputFormat
 *         construction
 */
@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
    // We will use TextInputFormat, the default Hadoop input format for
    // text. It has a LongWritable key that we will ignore, and the value
    // is a Text (a string writable) that the JSON data is in.
    return new TextInputFormat();
}
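To show where getInputFormat() fits in a Pig loader's lifecycle, here is a hedged skeleton of the surrounding LoadFunc methods. The class name and the pass-through handling of the line are illustrative assumptions, not the actual JsonLoader; a real loader would parse the JSON in getNext().

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.pig.LoadFunc;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

// Sketch of a complete LoadFunc built around TextInputFormat; illustrative only.
public class SketchJsonLoader extends LoadFunc {
    @SuppressWarnings("rawtypes")
    private RecordReader reader;

    @Override
    public void setLocation(String location, Job job) throws IOException {
        // Point the InputFormat at the data; Pig calls this on front end and back end.
        FileInputFormat.setInputPaths(job, new Path(location));
    }

    @Override
    @SuppressWarnings("rawtypes")
    public InputFormat getInputFormat() throws IOException {
        return new TextInputFormat(); // one line of text per record
    }

    @Override
    @SuppressWarnings("rawtypes")
    public void prepareToRead(RecordReader reader, PigSplit split) {
        this.reader = reader; // Pig hands back a reader created from our InputFormat
    }

    @Override
    public Tuple getNext() throws IOException {
        try {
            if (!reader.nextKeyValue()) {
                return null; // end of this split
            }
            Text line = (Text) reader.getCurrentValue();
            // A real loader would parse the JSON here; this sketch passes the raw line through.
            return TupleFactory.getInstance().newTuple(line.toString());
        } catch (InterruptedException e) {
            throw new IOException(e);
        }
    }
}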
From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License: Apache License

@SuppressWarnings("rawtypes")
@Override
public InputFormat getInputFormat() throws IOException {
    AvroStorageLog.funcCall("getInputFormat");
    InputFormat result = null;
    if (inputAvroSchema != null) {
        result = new PigAvroInputFormat(inputAvroSchema, ignoreBadFiles,
                schemaToMergedSchemaMap, useMultipleSchemas);
    } else {
        result = new TextInputFormat();
    }
    return result;
}
From source file: com.mozilla.pig.load.DateRangeLoader.java
License: Apache License

@Override
@SuppressWarnings("rawtypes")
public InputFormat getInputFormat() throws IOException {
    return new TextInputFormat();
}
From source file: com.qq.pig.udf.CustomJsonLoader.java
License: Apache License

@SuppressWarnings("unchecked")
public InputFormat getInputFormat() throws IOException {
    // We will use TextInputFormat, the default Hadoop input format for
    // text. It has a LongWritable key that we will ignore, and the value
    // is a Text (a string writable) that the JSON data is in.
    return new TextInputFormat();
}
From source file: com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License: Apache License

private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader =
            textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // unlike Iterator.hasNext(), this actually advances
    }
    return batch;
}
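A note on the new-API reader contract this example relies on: createRecordReader() only constructs the reader, initialize() must be called before the first nextKeyValue(), and nextKeyValue() both advances and reports whether a record was read. The generic consumption loop therefore looks like the following sketch (the process() handler is hypothetical):

// Generic consumption loop for any new-API RecordReader (names illustrative).
recordReader.initialize(fileSplit, taskAttemptContext);
while (recordReader.nextKeyValue()) {                 // advances AND reports availability
    LongWritable key = recordReader.getCurrentKey();  // byte offset of the line
    Text value = recordReader.getCurrentValue();      // the line itself
    process(key, value);                              // hypothetical per-record handler
}
recordReader.close();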
From source file: crunch.MaxTemperature.java
License: Apache License

// Note: despite appearing on this page, this test uses the old-style
// org.apache.hadoop.mapred.TextInputFormat API (configure/getSplits/getRecordReader),
// not org.apache.hadoop.mapreduce.lib.input.TextInputFormat.
@Test
public void text() throws Exception {
    String input = "On the top of the Crumpetty Tree\n"
            + "The Quangle Wangle sat,\n"
            + "But his face you could not see,\n"
            + "On account of his Beaver Hat.";
    writeInput(input);

    TextInputFormat format = new TextInputFormat();
    format.configure(conf);
    InputSplit[] splits = format.getSplits(conf, 1);
    RecordReader<LongWritable, Text> recordReader = format.getRecordReader(splits[0], conf, Reporter.NULL);
    checkNextLine(recordReader, 0, "On the top of the Crumpetty Tree");
    checkNextLine(recordReader, 33, "The Quangle Wangle sat,");
    checkNextLine(recordReader, 57, "But his face you could not see,");
    checkNextLine(recordReader, 89, "On account of his Beaver Hat.");
}
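For comparison, a hedged sketch of the same first-line check against the new org.apache.hadoop.mapreduce API. It assumes 'conf' is the test's Configuration and that the file written by writeInput() has been added to the job's input paths; the test name is illustrative.

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

@Test
public void textNewApi() throws Exception {
    Job job = Job.getInstance(conf);
    TextInputFormat format = new TextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader<LongWritable, Text> reader = format.createRecordReader(splits.get(0), context);
    reader.initialize(splits.get(0), context); // required before the first nextKeyValue()

    assertTrue(reader.nextKeyValue());
    assertEquals(0L, reader.getCurrentKey().get());
    assertEquals("On the top of the Crumpetty Tree", reader.getCurrentValue().toString());
    reader.close();
}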
From source file: crunch.MaxTemperature.java
License: Apache License

@Override
public InputFormat getInputFormat() {
    return new TextInputFormat();
}
From source file: eu.stratosphere.hadoopcompatibility.mapreduce.example.WordCount.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Stratosphere job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both keys, since this test is executed with both
    // the hadoop1 and hadoop2 profiles
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
From source file: org.apache.flink.hadoopcompatibility.mapreduce.example.WordCount.java
License: Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setDegreeOfParallelism(1);

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat = new HadoopInputFormat<LongWritable, Text>(
            new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat = new HadoopOutputFormat<Text, IntWritable>(
            new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both keys, since this test is executed with both
    // the hadoop1 and hadoop2 profiles
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
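The Tokenizer referenced in both word-count examples is not shown on this page. A plausible minimal implementation for this job's types is sketched below, using the FlatMapFunction interface from later Flink releases; the Stratosphere-era class it replaces may have differed, so treat this as an assumption rather than the project's actual code.

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

// Sketch of a tokenizer compatible with the flatMap call above:
// Hadoop (offset, line) pairs in, (word, 1) pairs out.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}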