Example usage for org.apache.hadoop.mapreduce RecordReader initialize

List of usage examples for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.RecordReader#initialize, collected from open source projects.

Prototype

public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Document

Called once at initialization.
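
A minimal sketch of the calling sequence (not taken from the examples below; the input path is a placeholder): create a reader, call initialize(split, context) exactly once, then iterate with nextKeyValue() and close the reader.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderInitializeSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Placeholder input file; any local or HDFS text file works here.
        Path path = new Path("/tmp/example.txt");
        FileSplit split = new FileSplit(path, 0, Long.MAX_VALUE, null);

        RecordReader<LongWritable, Text> reader = new LineRecordReader();
        // initialize() is called once, before the first nextKeyValue().
        reader.initialize(split, new TaskAttemptContextImpl(conf, new TaskAttemptID()));
        try {
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
        } finally {
            reader.close();
        }
    }
}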

Usage

From source file:org.apache.avro.mapreduce.TestAvroKeyRecordReader.java

License:Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. "first"
    //   2. "second"
    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
                    Schema.create(Schema.Type.STRING), "first", "second"));

    // Create the record reader.
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    RecordReader<AvroKey<CharSequence>, NullWritable> recordReader = new AvroKeyRecordReader<CharSequence>(
            readerSchema) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    NullWritable value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    CharSequence firstString = key.datum();
    assertEquals("first", firstString.toString());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    CharSequence secondString = key.datum();
    assertEquals("second", secondString.toString());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f, recordReader.getProgress(),
            0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}

From source file:org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java

License:Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. <"firstkey", 1>
    //   2. <"second", 2>
    Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.STRING),
            Schema.create(Schema.Type.INT));

    AvroKeyValue<CharSequence, Integer> firstInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    firstInputRecord.setKey("first");
    firstInputRecord.setValue(1);

    AvroKeyValue<CharSequence, Integer> secondInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    secondInputRecord.setKey("second");
    secondInputRecord.setValue(2);

    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
                    firstInputRecord.get(), secondInputRecord.get()));

    // Create the record reader over the avro input file.
    RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader = new AvroKeyValueRecordReader<CharSequence, Integer>(
            Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    AvroValue<Integer> value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    assertEquals("first", key.datum().toString());
    assertEquals(1, value.datum().intValue());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    assertEquals("second", key.datum().toString());
    assertEquals(2, value.datum().intValue());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f, recordReader.getProgress(),
            0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}

From source file:org.apache.carbondata.processing.loading.jsoninput.JsonInputFormat.java

License:Apache License

@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    RecordReader<LongWritable, Text> rdr;

    if (JsonInputFormat.getOneRecordPerLine(context.getConfiguration())) {
        rdr = new SimpleJsonRecordReader();
    } else {
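        // Note: this branch returns the JsonRecordReader without calling
        // initialize() here; the caller is expected to initialize it.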
        return new JsonRecordReader();
    }
    rdr.initialize(split, context);
    return rdr;
}

From source file:org.apache.carbondata.sdk.file.CarbonReaderBuilder.java

License:Apache License

/**
 * Build CarbonReader
 *
 * @param <T>
 * @return CarbonReader
 * @throws IOException
 * @throws InterruptedException
 */
public <T> CarbonReader<T> build() throws IOException, InterruptedException {
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    CarbonTable table;
    // now always infer schema. TODO:Refactor in next version.
    table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
    final CarbonFileInputFormat format = new CarbonFileInputFormat();
    final Job job = new Job(hadoopConf);
    format.setTableInfo(job.getConfiguration(), table.getTableInfo());
    format.setTablePath(job.getConfiguration(), table.getTablePath());
    format.setTableName(job.getConfiguration(), table.getTableName());
    format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    if (filterExpression != null) {
        format.setFilterPredicates(job.getConfiguration(), filterExpression);
    }

    if (projectionColumns != null) {
        // set the user projection
        int len = projectionColumns.length;
        //      TODO : Handle projection of complex child columns
        for (int i = 0; i < len; i++) {
            if (projectionColumns[i].contains(".")) {
                throw new UnsupportedOperationException(
                        "Complex child columns projection NOT supported through CarbonReader");
            }
        }
        format.setColumnProjection(job.getConfiguration(), projectionColumns);
    }

    try {

        if (filterExpression == null) {
            job.getConfiguration().set("filter_blocks", "false");
        }
        List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
        List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader reader;
            QueryModel queryModel = format.createQueryModel(split, attempt);
            boolean hasComplex = false;
            for (ProjectionDimension projectionDimension : queryModel.getProjectionDimensions()) {
                if (projectionDimension.getDimension().isComplex()) {
                    hasComplex = true;
                    break;
                }
            }
            if (useVectorReader && !hasComplex) {
                queryModel.setDirectVectorFill(filterExpression == null);
                reader = new CarbonVectorizedRecordReader(queryModel);
            } else {
                reader = format.createRecordReader(split, attempt);
            }
            try {
                reader.initialize(split, attempt);
                readers.add(reader);
            } catch (Exception e) {
                CarbonUtil.closeStreams(readers.toArray(new RecordReader[0]));
                throw e;
            }
        }
        return new CarbonReader<>(readers);
    } catch (Exception ex) {
        // Clear the datamap cache as it can get added in getSplits() method
        DataMapStoreManager.getInstance().clearDataMaps(table.getAbsoluteTableIdentifier());
        throw ex;
    }
}

From source file:org.apache.carbondata.store.LocalCarbonStore.java

License:Apache License

@Override
public Iterator<CarbonRow> scan(AbsoluteTableIdentifier tableIdentifier, String[] projectColumns,
        Expression filter) throws IOException {
    Objects.requireNonNull(tableIdentifier);
    Objects.requireNonNull(projectColumns);

    CarbonTable table = getTable(tableIdentifier.getTablePath());
    if (table.isStreamingSink() || table.isHivePartitionTable()) {
        throw new UnsupportedOperationException("streaming and partition table is not supported");
    }
    // TODO: use InputFormat to prune data and read data

    final CarbonTableInputFormat format = new CarbonTableInputFormat();
    final Job job = new Job(new Configuration());
    CarbonInputFormat.setTableInfo(job.getConfiguration(), table.getTableInfo());
    CarbonInputFormat.setTablePath(job.getConfiguration(), table.getTablePath());
    CarbonInputFormat.setTableName(job.getConfiguration(), table.getTableName());
    CarbonInputFormat.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    CarbonInputFormat.setCarbonReadSupport(job.getConfiguration(), CarbonRowReadSupport.class);
    CarbonInputFormat.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectColumns));
    if (filter != null) {
        CarbonInputFormat.setFilterPredicates(job.getConfiguration(), filter);
    }

    final List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));

    List<RecordReader<Void, Object>> readers = new ArrayList<>(splits.size());

    List<CarbonRow> rows = new ArrayList<>();

    try {
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader reader = format.createRecordReader(split, attempt);
            reader.initialize(split, attempt);
            readers.add(reader);
        }

        for (RecordReader<Void, Object> reader : readers) {
            while (reader.nextKeyValue()) {
                rows.add((CarbonRow) reader.getCurrentValue());
            }
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    } catch (InterruptedException e) {
        throw new IOException(e);
    } finally {
        for (RecordReader<Void, Object> reader : readers) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    }
    return rows.iterator();
}

From source file:org.apache.crunch.io.hcatalog.HCatRecordDataIterable.java

License:Apache License

@Override
public Iterator<HCatRecord> iterator() {
    try {
        Job job = Job.getInstance(bundle.configure(conf));

        final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
        final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        return Iterators
                .concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<HCatRecord>>() {

                    @Override
                    public Iterator<HCatRecord> apply(InputSplit split) {
                        RecordReader reader = null;
                        try {
                            reader = fmt.createRecordReader(split, ctxt);
                            reader.initialize(split, ctxt);
                        } catch (IOException | InterruptedException e) {
                            throw new CrunchRuntimeException(e);
                        }
                        return new HCatRecordReaderIterator(reader);
                    }
                }).iterator());
    } catch (Exception e) {
        throw new CrunchRuntimeException(e);
    }
}

From source file:org.apache.crunch.io.impl.DefaultFileReaderFactory.java

License:Apache License

@Override
public Iterator<T> read(FileSystem fs, Path path) {
    final Configuration conf = new Configuration(fs.getConf());
    bundle.configure(conf);
    ptype.initialize(conf);

    final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
    final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    try {
        Job job = new Job(conf);
        FileInputFormat.addInputPath(job, path);
        return Iterators.concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<T>>() {
            @Override
            public Iterator<T> apply(InputSplit split) {
                try {
                    RecordReader reader = fmt.createRecordReader(split, ctxt);
                    reader.initialize(split, ctxt);
                    return new RecordReaderIterator<T>(reader, ptype);
                } catch (Exception e) {
                    LOG.error("Error reading split: " + split, e);
                    throw new CrunchRuntimeException(e);
                }
            }
        }).iterator());
    } catch (Exception e) {
        LOG.error("Error reading path: " + path, e);
        throw new CrunchRuntimeException(e);
    }
}

From source file:org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java

License:Apache License

@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
    List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
    Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(),
            topic);
    Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

    Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
    for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
        Long endingOffset = endOffsets.get(entry.getKey());
        offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
    }

    KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

    KafkaInputFormat inputFormat = new KafkaInputFormat();
    inputFormat.setConf(config);
    List<InputSplit> splits = inputFormat.getSplits(null);

    assertThat(splits.size(), is(offsets.size()));

    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
        assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
        assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
    }

    //create readers and consume the data
    when(taskContext.getConfiguration()).thenReturn(config);
    Set<String> keysRead = new HashSet<>();
    //read all data from all splits
    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        long start = inputSplit.getStartingOffset();
        long end = inputSplit.getEndingOffset();

        RecordReader<BytesWritable, BytesWritable> recordReader = inputFormat.createRecordReader(split,
                taskContext);
        recordReader.initialize(split, taskContext);

        int numRecordsFound = 0;
        String currentKey;
        while (recordReader.nextKeyValue()) {
            currentKey = new String(recordReader.getCurrentKey().getBytes());
            keysRead.add(currentKey);
            assertThat(keys, hasItem(currentKey));
            assertThat(recordReader.getCurrentValue(), is(notNullValue()));
            numRecordsFound++;
        }
        recordReader.close();

        // assert that it encountered a partition's worth of data
        assertThat(((long) numRecordsFound), is(end - start));
    }

    //validate the same number of unique keys was read as were written.
    assertThat(keysRead.size(), is(keys.size()));
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}