List of usage examples for org.apache.hadoop.mapred.FileInputFormat.getRecordReader
public abstract RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter) throws IOException;
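Before the project-specific examples, a minimal self-contained sketch of the shared call pattern may help: compute splits with getSplits, open each split with getRecordReader, and iterate with createKey/createValue/next. The choice of TextInputFormat and the command-line input path are illustrative assumptions, not taken from any of the sources below.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetRecordReaderSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path(args[0])); // input path supplied by the caller

        TextInputFormat format = new TextInputFormat();
        format.configure(job); // TextInputFormat is JobConfigurable

        // One reader per split; Reporter.NULL because we run outside a MapReduce task.
        for (InputSplit split : format.getSplits(job, 1)) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}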
From source file: cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java
License: Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter, Integer idx)
        throws Exception {
    // Carve the single file at position idx out of the combined split.
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    // Instantiate the per-file input format named in the configuration.
    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    // Delegate all record reading to the wrapped format's reader.
    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
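Here getRecordReader is used for delegation: the wrapper extracts a single-file FileSplit from a CombineFileSplit, reflectively instantiates the per-file input format, and opens a reader on just that slice.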
From source file: com.hdfs.concat.crush.CrushReducer.java
License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {
    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList.get(idx);

    try {
        // Point the job at this one file and let the input format compute its splits.
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        // A single small file is expected to yield exactly one split.
        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: com.m6d.filecrush.crush.CrushReducer.java
License: Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {
    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx);

    try {
        // Point the job at this one file and let the input format compute its splits.
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        // A single small file is expected to yield exactly one split.
        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java
License: Apache License

@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
        throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    // Copy the work unit's properties into a JobConf for the input format.
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : workUnitState.getPropertyNames()) {
        jobConf.set(key, workUnitState.getProp(key));
    }

    // Recover the FileSplit that was serialized into the work unit state.
    String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);

    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
    RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);

    boolean readKeys = workUnitState.getPropAsBoolean(HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY,
            HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);

    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}
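Note the Reporter.NULL argument: the extractor runs outside a MapReduce task, so there is no progress to report and the no-op reporter is the natural choice.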
From source file: org.icgc.dcc.release.core.hadoop.CombineFileRecordReaderWrapper.java
License: Open Source License

protected CombineFileRecordReaderWrapper(FileInputFormat<K, V> inputFormat, CombineFileSplit split,
        Configuration conf, Reporter reporter, Integer index) throws IOException {
    // Rebuild a per-file split from the combined split, then delegate to the given format.
    val fileSplit = new FileSplit(split.getPath(index), split.getOffset(index), split.getLength(index),
            split.getLocations());

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
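A leaner variant of the cascading wrapper above: the delegate input format is injected by the caller instead of being loaded reflectively from the configuration.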
From source file: org.pooledtimeseries.cartesian.CartesianRecordReader.java
License: Apache License

/**
 * Creates a new instance of the CartesianRecordReader.
 *
 * @param split
 * @param conf
 * @param reporter
 * @throws IOException
 */
public CartesianRecordReader(CompositeInputSplit split, JobConf conf, Reporter reporter) throws IOException {
    this.rightConf = conf;
    this.rightIS = split.get(1);
    this.rightReporter = reporter;

    try {
        // Create left record reader
        FileInputFormat<Text, BytesWritable> leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);

        leftRR = leftFIF.getRecordReader(split.get(0), conf, reporter);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);

        rightRR = rightFIF.getRecordReader(rightIS, rightConf, rightReporter);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    }

    // Create key value pairs for parsing
    lkey = (K1) this.leftRR.createKey();
    lvalue = (V1) this.leftRR.createValue();
    rkey = (K2) this.rightRR.createKey();
    rvalue = (V2) this.rightRR.createValue();
}
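Two readers are opened from the same composite split, one per side of the cross product; the input format class for each side is looked up by name in the JobConf and instantiated reflectively.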
From source file: org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License: Apache License

private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
        throws IOException {
    List<String> found = new ArrayList<String>();

    for (InputSplit split : format.getSplits(job, n)) {
        RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
        Text key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                found.add(key.toString());
            }
        } finally {
            reader.close();
        }
    }

    return found;
}
From source file: parquet.hive.TestDeprecatedParquetInputFormat.java
License: Apache License

private void readParquetHiveInputFormat(final String schemaRequested, final Integer[] arrCheckIndexValues)
        throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();
    System.out.println(schemaToString);

    final String specificSchema = schemaRequested == null ? schemaToString : schemaRequested;

    // Set the configuration parameters
    final String columnsStr = "message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + " optional int32 unknown;\n"
            + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, specificSchema,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;
    final int sizeExpected = mapData.size();

    while (reader.next(key, value)) {
        assertTrue(count < sizeExpected);
        assertTrue(key == null);

        final Writable[] arrValue = value.get();
        final ArrayWritable expected = mapData.get(((IntWritable) arrValue[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, arrExpected.length);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected, arrCheckIndexValues);
        assertTrue(deepEquals);
        count++;
    }

    System.out.println("nb lines " + count);

    reader.close();

    assertEquals("Number of lines found and data written don't match", count, sizeExpected);
}
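The split handed to getRecordReader here is not a plain FileSplit: the Parquet metadata (blocks, file and requested schemas, key/value metadata) is packed into a ParquetInputSplit and adapted to the old API through InputSplitWrapper.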
From source file: parquet.hive.TestDeprecatedParquetOuputFormat.java
License: Apache License

private void checkWrite() throws IOException, InterruptedException {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();

    final String columnsStr = "message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, schemaToString,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new DeprecatedParquetInputFormat.InputSplitWrapper(
            realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;

    while (reader.next(key, value)) {
        assertTrue(count < mapData.size());
        assertTrue(key == null);

        final Writable[] arrValue = value.get();
        final Writable[] writableArr = arrValue;
        final ArrayWritable expected = mapData.get(((IntWritable) writableArr[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, 10);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected,
                new Integer[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });
        assertTrue(deepEquals);
        count++;
    }

    reader.close();

    assertEquals("Number of lines found and data written don't match", count, mapData.size());
}
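Same wrapping pattern as the previous test, used here to read back what the output format wrote and compare each row field by field against the expected data.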
From source file: parquet.hive.TestMapredParquetInputFormat.java
License: Apache License

@Test
public void testGetSplit() throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType fileSchema = readFooter.getFileMetaData().getSchema();

    final MessageType requestedSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + "}");

    final MessageType hiveSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + " optional int32 c_custkey;\n"
            + " optional binary c_name;\n"
            + " optional binary c_address;\n"
            + " optional int32 c_nationkey;\n"
            + " optional binary c_phone;\n"
            + " optional double c_acctbal;\n"
            + " optional binary c_mktsegment;\n"
            + " optional binary c_comment;\n"
            + " optional group c_map (MAP_KEY_VALUE) {\n"
            + " repeated group map {\n"
            + " required binary key;\n"
            + " optional binary value;\n"
            + " }\n"
            + " }\n"
            + " optional group c_list (LIST) {\n"
            + " repeated group bag {\n"
            + " optional int32 array_element;\n"
            + " }\n"
            + " }\n"
            + " optional binary unknown;\n"
            + "}");

    // Put columns and projection info in the conf
    List<String> columns = new ArrayList<String>();
    List<Integer> readColumns = new ArrayList<Integer>();
    for (int i = 0; i < hiveSchema.getFieldCount(); ++i) {
        final String name = hiveSchema.getType(i).getName();
        columns.add(name);
        if (requestedSchema.containsField(name)) {
            readColumns.add(i);
        }
    }
    job.set("columns", StringUtils.join(columns, ","));
    ColumnProjectionUtils.setReadColumnIDs(job, readColumns);

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new MapredParquetInputFormat();
    final String[] locations = new String[] { "localhost" };

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, hiveSchema.toString());

    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, fileSchema.toString(), requestedSchema.toString(),
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final MapredParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    assertEquals("Wrong real split inside wrapper", realSplit,
            ((MapredParquetInputFormat.RecordReaderWrapper) reader).getSplit(splitWrapper, job));

    // Recreate the split using getSplit, as Hive would
    final FileSplit fileSplit = new FileSplit(splitWrapper.getPath(), splitWrapper.getStart(),
            splitWrapper.getLength(), splitWrapper.getLocations());
    final ParquetInputSplit recreatedSplit = ((MapredParquetInputFormat.RecordReaderWrapper) reader)
            .getSplit(fileSplit, job);

    assertTrue("Wrong file schema", UtilitiesTestMethods.smartCheckSchema(fileSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getFileSchema())));
    assertTrue("Wrong requested schema", UtilitiesTestMethods.smartCheckSchema(requestedSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getRequestedSchema())));
    assertTrue("Wrong hive schema", UtilitiesTestMethods.smartCheckSchema(hiveSchema,
            MessageTypeParser.parseMessageType(
                    recreatedSplit.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY))));
}
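Beyond constructing a reader, this test verifies that the RecordReaderWrapper can recreate an equivalent ParquetInputSplit from a plain FileSplit, which is what Hive would hand it at runtime.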