Example usage for org.apache.hadoop.mapred FileInputFormat getRecordReader

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.FileInputFormat#getRecordReader, collected from open-source projects.

Prototype

public abstract RecordReader<K, V> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException;
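
For orientation, here is a minimal, self-contained sketch of calling getRecordReader directly with the old mapred API. It uses TextInputFormat, a concrete FileInputFormat<LongWritable, Text>; the input path and the single-split assumption are illustrative only.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class GetRecordReaderSketch {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf(new Configuration());
        // Hypothetical input path, for illustration only.
        FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt"));

        TextInputFormat format = new TextInputFormat();
        format.configure(job); // TextInputFormat implements JobConfigurable

        // Ask the input format for splits, then for a reader over the first split.
        InputSplit[] splits = format.getSplits(job, 1);
        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[0], job, Reporter.NULL);

        LongWritable key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}

In a real MapReduce job the framework calls getRecordReader once per split on the task side; direct calls like this are mainly useful in tests and in wrapper input formats, as the examples below show.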

Usage

From source file:cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java

License:Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter,
        Integer idx) throws Exception {
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
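
A wrapper like the one above is typically wired into the old-API CombineFileInputFormat through CombineFileRecordReader, which instantiates the wrapper reflectively; that is why it exposes a (CombineFileSplit, Configuration, Reporter, Integer) constructor. A minimal sketch, with an illustrative class name and key/value types, imports omitted to match the snippets on this page:

public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        // CombineFileRecordReader walks the files in the CombineFileSplit and, for each one,
        // reflectively invokes the wrapper's (CombineFileSplit, Configuration, Reporter, Integer)
        // constructor, delegating record reading to the per-file reader it creates.
        return new CombineFileRecordReader<LongWritable, Text>(job, (CombineFileSplit) split, reporter,
                (Class) CombineFileRecordReaderWrapper.class);
    }
}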

From source file:com.hdfs.concat.crush.CrushReducer.java

License:Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = (Class<? extends FileInputFormat<?, ?>>) inFormatClsList
            .get(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:com.m6d.filecrush.crush.CrushReducer.java

License:Apache License

@SuppressWarnings("unchecked")
private RecordReader<Object, Object> createRecordReader(int idx, Path inputPath, Reporter reporter)
        throws IOException {

    LOG.info(format("Opening '%s'", inputPath));

    Class<? extends FileInputFormat<?, ?>> cls = getInputFormatClass(idx);

    try {
        FileInputFormat.setInputPaths(job, inputPath);

        FileInputFormat<?, ?> instance = cls.newInstance();

        if (instance instanceof JobConfigurable) {
            ((JobConfigurable) instance).configure(job);
        }

        InputSplit[] splits = instance.getSplits(job, 1);

        if (1 != splits.length) {
            throw new IllegalArgumentException("Could not get input splits: " + inputPath);
        }

        return (RecordReader<Object, Object>) instance.getRecordReader(splits[0], job, reporter);
    } catch (RuntimeException e) {
        throw e;
    } catch (IOException e) {
        throw e;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}

From source file:gobblin.source.extractor.hadoop.OldApiHadoopFileInputSource.java

License:Apache License

@Override
public Extractor<S, D> getExtractor(WorkUnitState workUnitState) throws IOException {
    if (!workUnitState.contains(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY)) {
        throw new IOException("No serialized FileSplit found in WorkUnitState " + workUnitState.getId());
    }

    JobConf jobConf = new JobConf(new Configuration());
    for (String key : workUnitState.getPropertyNames()) {
        jobConf.set(key, workUnitState.getProp(key));
    }

    String fileSplitBytesStr = workUnitState.getProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY);
    FileSplit fileSplit = (FileSplit) HadoopUtils.deserializeFromString(FileSplit.class, fileSplitBytesStr);
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(workUnitState, jobConf);
    RecordReader<K, V> recordReader = fileInputFormat.getRecordReader(fileSplit, jobConf, Reporter.NULL);
    boolean readKeys = workUnitState.getPropAsBoolean(HadoopFileInputSource.FILE_INPUT_READ_KEYS_KEY,
            HadoopFileInputSource.DEFAULT_FILE_INPUT_READ_KEYS);
    return getExtractor(workUnitState, recordReader, fileSplit, readKeys);
}

From source file:org.icgc.dcc.release.core.hadoop.CombineFileRecordReaderWrapper.java

License:Open Source License

protected CombineFileRecordReaderWrapper(FileInputFormat<K, V> inputFormat, CombineFileSplit split,
        Configuration conf, Reporter reporter, Integer index) throws IOException {
    val fileSplit = new FileSplit(split.getPath(index), split.getOffset(index), split.getLength(index),
            split.getLocations());

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}

From source file:org.pooledtimeseries.cartesian.CartesianRecordReader.java

License:Apache License

/**
 * Creates a new instance of the CartesianRecordReader
 *
 * @param split
 * @param conf
 * @param reporter
 * @throws IOException
 */
public CartesianRecordReader(CompositeInputSplit split, JobConf conf, Reporter reporter) throws IOException {
    this.rightConf = conf;
    this.rightIS = split.get(1);
    this.rightReporter = reporter;

    try {
        // Create left record reader
        FileInputFormat<Text, BytesWritable> leftFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.LEFT_INPUT_FORMAT)), conf);

        leftRR = leftFIF.getRecordReader(split.get(0), conf, reporter);

        // Create right record reader
        rightFIF = (FileInputFormat) ReflectionUtils
                .newInstance(Class.forName(conf.get(CartesianInputFormat.RIGHT_INPUT_FORMAT)), conf);

        rightRR = rightFIF.getRecordReader(rightIS, rightConf, rightReporter);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
        throw new IOException(e);
    }

    // Create key value pairs for parsing
    lkey = (K1) this.leftRR.createKey();
    lvalue = (V1) this.leftRR.createValue();

    rkey = (K2) this.rightRR.createKey();
    rvalue = (V2) this.rightRR.createValue();
}

From source file:org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java

License:Apache License

private static List<String> collect(FileInputFormat<Text, Text> format, JobConf job, int n, Reporter reporter)
        throws IOException {
    List<String> found = new ArrayList<String>();
    for (InputSplit split : format.getSplits(job, n)) {
        RecordReader<Text, Text> reader = format.getRecordReader(split, job, reporter);
        Text key = reader.createKey();
        Text value = reader.createValue();
        try {
            while (reader.next(key, value)) {
                found.add(key.toString());
            }
        } finally {
            reader.close();
        }
    }
    return found;
}

From source file:parquet.hive.TestDeprecatedParquetInputFormat.java

License:Apache License

private void readParquetHiveInputFormat(final String schemaRequested, final Integer[] arrCheckIndexValues)
        throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();
    System.out.println(schemaToString);

    final String specificSchema = schemaRequested == null ? schemaToString : schemaRequested;

    // Set the configuration parameters
    final String columnsStr = "message customer {\n" + "  optional int32 c_custkey;\n"
            + "  optional binary c_name;\n" + "  optional binary c_address;\n"
            + "  optional int32 c_nationkey;\n" + "  optional binary c_phone;\n"
            + "  optional double c_acctbal;\n" + "  optional binary c_mktsegment;\n"
            + "  optional binary c_comment;\n" + "  optional group c_map (MAP_KEY_VALUE) {\n"
            + "    repeated group map {\n" + "      required binary key;\n" + "      optional binary value;\n"
            + "    }\n" + "  }\n" + "  optional group c_list (LIST) {\n" + "    repeated group bag {\n"
            + "      optional int32 array_element;\n" + "    }\n" + "  }\n" + "  optional int32 unknown;\n"
            + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);
    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, specificSchema,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;
    final int sizeExpected = mapData.size();
    while (reader.next(key, value)) {
        assertTrue(count < sizeExpected);
        assertTrue(key == null);
        final Writable[] arrValue = value.get();
        final ArrayWritable expected = mapData.get(((IntWritable) arrValue[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, arrExpected.length);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected,
                arrCheckIndexValues);

        assertTrue(deepEquals);
        count++;
    }
    System.out.println("nb lines " + count);
    reader.close();

    assertEquals("Number of lines found and data written don't match", count, sizeExpected);
}

From source file:parquet.hive.TestDeprecatedParquetOuputFormat.java

License:Apache License

private void checkWrite() throws IOException, InterruptedException {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));
    final MessageType schema = readFooter.getFileMetaData().getSchema();

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new DeprecatedParquetInputFormat();
    final String[] locations = new String[] { "localhost" };
    final String schemaToString = schema.toString();
    final String columnsStr = "message customer {\n" + "  optional int32 c_custkey;\n"
            + "  optional binary c_name;\n" + "  optional binary c_address;\n"
            + "  optional int32 c_nationkey;\n" + "  optional binary c_phone;\n"
            + "  optional double c_acctbal;\n" + "  optional binary c_mktsegment;\n"
            + "  optional binary c_comment;\n" + "  optional group c_map (MAP_KEY_VALUE) {\n"
            + "    repeated group map {\n" + "      required binary key;\n" + "      optional binary value;\n"
            + "    }\n" + "  }\n" + "  optional group c_list (LIST) {\n" + "    repeated group bag {\n"
            + "      optional int32 array_element;\n" + "    }\n" + "  }\n" + "}";

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, columnsStr);
    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, schemaToString, schemaToString,
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final DeprecatedParquetInputFormat.InputSplitWrapper splitWrapper = new DeprecatedParquetInputFormat.InputSplitWrapper(
            realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    // create key/value
    final Void key = reader.createKey();
    final ArrayWritable value = reader.createValue();

    int count = 0;
    while (reader.next(key, value)) {
        assertTrue(count < mapData.size());
        assertTrue(key == null);
        final Writable[] arrValue = value.get();
        final Writable[] writableArr = arrValue;
        final ArrayWritable expected = mapData.get(((IntWritable) writableArr[0]).get());
        final Writable[] arrExpected = expected.get();
        assertEquals(arrValue.length, 10);

        final boolean deepEquals = UtilitiesTestMethods.smartCheckArray(arrValue, arrExpected,
                new Integer[] { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 });

        assertTrue(deepEquals);
        count++;
    }
    reader.close();

    assertEquals("Number of lines found and data written don't match", count, mapData.size());

}

From source file:parquet.hive.TestMapredParquetInputFormat.java

License:Apache License

@Test
public void testGetSplit() throws Exception {
    final ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, new Path(testFile.getAbsolutePath()));

    final MessageType fileSchema = readFooter.getFileMetaData().getSchema();
    final MessageType requestedSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + "  optional int32 c_custkey;\n" + "  optional binary c_name;\n" + "  optional double c_acctbal;\n"
            + "  optional binary c_mktsegment;\n" + "  optional binary c_comment;\n" + "}");
    final MessageType hiveSchema = MessageTypeParser.parseMessageType("message customer {\n"
            + "  optional int32 c_custkey;\n" + "  optional binary c_name;\n" + "  optional binary c_address;\n"
            + "  optional int32 c_nationkey;\n" + "  optional binary c_phone;\n"
            + "  optional double c_acctbal;\n" + "  optional binary c_mktsegment;\n"
            + "  optional binary c_comment;\n" + "  optional group c_map (MAP_KEY_VALUE) {\n"
            + "    repeated group map {\n" + "      required binary key;\n" + "      optional binary value;\n"
            + "    }\n" + "  }\n" + "  optional group c_list (LIST) {\n" + "    repeated group bag {\n"
            + "      optional int32 array_element;\n" + "    }\n" + "  }\n" + "  optional binary unknown;\n"
            + "}");

    // Put columns and projection info in the conf
    List<String> columns = new ArrayList<String>();
    List<Integer> readColumns = new ArrayList<Integer>();
    for (int i = 0; i < hiveSchema.getFieldCount(); ++i) {
        final String name = hiveSchema.getType(i).getName();
        columns.add(name);
        if (requestedSchema.containsField(name)) {
            readColumns.add(i);
        }
    }
    job.set("columns", StringUtils.join(columns, ","));
    ColumnProjectionUtils.setReadColumnIDs(job, readColumns);

    long size = 0;
    final List<BlockMetaData> blocks = readFooter.getBlocks();
    for (final BlockMetaData block : blocks) {
        size += block.getTotalByteSize();
    }

    final FileInputFormat<Void, ArrayWritable> format = new MapredParquetInputFormat();
    final String[] locations = new String[] { "localhost" };

    final Map<String, String> readSupportMetaData = new HashMap<String, String>();
    readSupportMetaData.put(DataWritableReadSupport.HIVE_SCHEMA_KEY, hiveSchema.toString());
    final ParquetInputSplit realSplit = new ParquetInputSplit(new Path(testFile.getAbsolutePath()), 0, size,
            locations, blocks, fileSchema.toString(), requestedSchema.toString(),
            readFooter.getFileMetaData().getKeyValueMetaData(), readSupportMetaData);

    final MapredParquetInputFormat.InputSplitWrapper splitWrapper = new InputSplitWrapper(realSplit);

    // construct the record reader
    final RecordReader<Void, ArrayWritable> reader = format.getRecordReader(splitWrapper, job, reporter);

    assertEquals("Wrong real split inside wrapper", realSplit,
            ((MapredParquetInputFormat.RecordReaderWrapper) reader).getSplit(splitWrapper, job));

    // Recreate the split using getSplit, as Hive would
    final FileSplit fileSplit = new FileSplit(splitWrapper.getPath(), splitWrapper.getStart(),
            splitWrapper.getLength(), splitWrapper.getLocations());
    final ParquetInputSplit recreatedSplit = ((MapredParquetInputFormat.RecordReaderWrapper) reader)
            .getSplit(fileSplit, job);
    assertTrue("Wrong file schema", UtilitiesTestMethods.smartCheckSchema(fileSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getFileSchema())));
    assertTrue("Wrong requested schema", UtilitiesTestMethods.smartCheckSchema(requestedSchema,
            MessageTypeParser.parseMessageType(recreatedSplit.getRequestedSchema())));
    assertTrue("Wrong hive schema",
            UtilitiesTestMethods.smartCheckSchema(hiveSchema, MessageTypeParser.parseMessageType(
                    recreatedSplit.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY))));
}