List of usage examples for org.apache.hadoop.mapred.FileInputFormat.getRecordReader
public abstract RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException;
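Before the project-specific examples, a minimal self-contained sketch of the shared call pattern may help: compute splits with getSplits, open each split with getRecordReader, and iterate with createKey/createValue/next. The choice of TextInputFormat and the command-line input path are illustrative assumptions, not taken from any of the sources below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetRecordReaderSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path(args[0])); // input path supplied by the caller

        TextInputFormat format = new TextInputFormat();
        format.configure(job); // TextInputFormat is JobConfigurable

        // One reader per split; Reporter.NULL because we run outside a MapReduce task.
        for (InputSplit split : format.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}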
From source file: cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java
License: Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter, Integer idx)
        throws Exception {
    // Carve the single file at position idx out of the combined split.
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    // Instantiate the per-file input format named in the configuration.
    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    // Delegate all record reading to the wrapped format's reader.
    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
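Here getRecordReader is used for delegation: the wrapper extracts a single-file FileSplit from a CombineFileSplit, reflectively instantiates the per-file input format, and opens a reader on just that slice.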
From source file: com.hdfs.concat.crush.CrushReducer.java
License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {
    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx);

    try {
        // Point the job at this one file and let the input format compute its splits.
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        // A single small file is expected to yield exactly one split.
        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: com.m6d.filecrush.crush.CrushReducer.java
License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {
    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx);

    try {
        // Point the job at this one file and let the input format compute its splits.
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        // A single small file is expected to yield exactly one split.
        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java
License: Apache License

@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
        throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    // Copy the work unit's properties into a JobConf for the input format.
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : workUnitState.getPropertyNames()) {
        jobConf.set(key, workUnitState.getProp(key));
    }

    // Recover the FileSplit that was serialized into the work unit state.
    String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);

    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
    RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);

    boolean readKeys = workUnitState.getPropAsBoolean(HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY,
            HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);

    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}
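Note the Reporter.NULL argument: the extractor runs outside a MapReduce task, so there is no progress to report and the no-op reporter is the natural choice.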
From source file: org.icgc.dcc.release.core.hadoop.CombineFileRecordReaderWrapper.java
License: Open Source License

protected CombineFileRecordReaderWrapper(FileInputFormat<K, V> inputFormat, CombineFileSplit split,
        Configuration conf, Reporter reporter, Integer index) throws IOException {
    // Rebuild a per-file split from the combined split, then delegate to the given format.
    val fileSplit = new FileSplit(split.getPath(index), split.getOffset(index), split.getLength(index),
            split.getLocations());

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
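A leaner variant of the cascading wrapper above: the delegate input format is injected by the caller instead of being loaded reflectively from the configuration.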
From source file: org.pooledtimeseries.cartesian.CartesianRecordReader.java
License: Apache License

/**
 * Creates a new instance of the CartesianRecordReader.
 *
 * @param split
 * @param conf
 * @param reporter
 * @throws IOException
 */
public CartesianRecordReader(CompositeInputSplit split, JobConf conf, Reporter reporter) throws IOException {
    this.rightConf = conf;
    this.rightIS = split.get(1);
    this.rightReporter = reporter;

    try {
        // Create left record reader
        FileInputFormat<Text, BytesWritable> leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);

        leftRR = leftFIF.getRecordReader(split.get(0), conf, reporter);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);

        rightRR = rightFIF.getRecordReader(rightIS, rightConf, rightReporter);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    }

    // Create key value pairs for parsing
    lkey = (K1) this.leftRR.createKey();
    lvalue = (V1) this.leftRR.createValue();
    rkey = (K2) this.rightRR.createKey();
    rvalue = (V2) this.rightRR.createValue();
}
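Two readers are opened from the same composite split, one per side of the cross product; the input format class for each side is looked up by name in the JobConf and instantiated reflectively.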
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License

private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
        throws IOException {
    List<String> found = new ArrayList<String>();

    for (InputSplit split : format.getSplits(job, n)) {
        RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
        Text key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                found.add(key.toString());
            }
        } finally {
            reader.close();
        }
    }

    return found;
}
From source file: parquet.hive.TestDeprecatedParquetInputFormat.java
License: Apache License

private void readParquetHiveInputFormat(final String schemaRequested, final Integer[] arrCheckIndexValues)
        throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();
    System.out.println(schemaToString);

    final String specificSchema = schemaRequested == null ? schemaToString : schemaRequested;

    // Set the configuration parameters
    final String columnsStr = "message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + " optional int32 unknown;\n"
            + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, specificSchema,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;
    final int sizeExpected = mapData.size();

    while (reader.next(key, value)) {
        assertTrue(count < sizeExpected);
        assertTrue(key == null);

        final Writable[] arrValue = value.get();
        final ArrayWritable expected = mapData.get(((IntWritable) arrValue[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, arrExpected.length);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected, arrCheckIndexValues);
        assertTrue(deepEquals);
        count++;
    }

    System.out.println("nb lines " + count);

    reader.close();

    assertEquals("Number of lines found and data written don't match", count, sizeExpected);
}
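The split handed to getRecordReader here is not a plain FileSplit: the Parquet metadata (blocks, file and requested schemas, key/value metadata) is packed into a ParquetInputSplit and adapted to the old API through InputSplitWrapper.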
From source file: parquet.hive.TestDeprecatedParquetOuputFormat.java
License: Apache License

private void checkWrite() throws IOException, InterruptedException {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();

    final String columnsStr = "message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, schemaToString,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new DeprecatedParquetInputFormat.InputSplitWrapper(
            realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;

    while (reader.next(key, value)) {
        assertTrue(count < mapData.size());
        assertTrue(key == null);

        final Writable[] arrValue = value.get();
        final Writable[] writableArr = arrValue;
        final ArrayWritable expected = mapData.get(((IntWritable) writableArr[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, 10);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected,
                new Integer[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
        assertTrue(deepEquals);
        count++;
    }

    reader.close();

    assertEquals("Number of lines found and data written don't match", count, mapData.size());
}
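Same wrapping pattern as the previous test, used here to read back what the output format wrote and compare each row field by field against the expected data.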
From source file: parquet.hive.TestMapredParquetInputFormat.java
License: Apache License

@Test
public void testGetSplit() throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType fileSchema = readFooter.getFileMetaData().getSchema();

    final MessageType requestedSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + "}");

    final MessageType hiveSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + " optional binary unknown;\n"
            + "}");

    // Put columns and projection info in the conf
    List<String> columns = new ArrayList<String>();
    List<Integer> readColumns = new ArrayList<Integer>();
    for (int i = 0; i < hiveSchema.getFieldCount(); ++i) {
        final String name = hiveSchema.getType(i).getName();
        columns.add(name);
        if (requestedSchema.containsField(name)) {
            readColumns.add(i);
        }
    }
    job.set("columns", StringUtils.join(columns, ","));
    ColumnProjectionUtils.setReadColumnIDs(job, readColumns);

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new MapredParquetInputFormat();
    final String[] locations = new String[] { "localhost" };

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, hiveSchema.toString());

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, fileSchema.toString(), requestedSchema.toString(),
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final MapredParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    assertEquals("Wrong real split inside wrapper", realSplit,
            ((MapredParquetInputFormat.RecordReaderWrapper) reader).getSplit(splitWrapper, job));

    // Recreate the split using getSplit, as Hive would
    final FileSplit fileSplit = new FileSplit(splitWrapper.getPath(), splitWrapper.getStart(),
            splitWrapper.getLength(), splitWrapper.getLocations());
    final ParquetInputSplit recreatedSplit = ((MapredParquetInputFormat.RecordReaderWrapper) reader)
            .getSplit(fileSplit, job);

    assertTrue("Wrong file schema", UtilitiesTestMethods.smartCheckSchema(fileSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getFileSchema())));
    assertTrue("Wrong requested schema", UtilitiesTestMethods.smartCheckSchema(requestedSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getRequestedSchema())));
    assertTrue("Wrong hive schema", UtilitiesTestMethods.smartCheckSchema(hiveSchema,
            MessageTypeParser.parseMessageType(
                    recreatedSplit.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY))));
}
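Beyond constructing a reader, this test verifies that the RecordReaderWrapper can recreate an equivalent ParquetInputSplit from a plain FileSplit, which is what Hive would hand it at runtime.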