List of usage examples for org.apache.hadoop.mapreduce.RecordReader#initialize
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
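Every example below follows the same calling pattern: obtain a RecordReader from an InputFormat, call initialize(split, context) before the first nextKeyValue(), and close the reader when done. As a quick orientation, here is a minimal, self-contained sketch of that pattern; the class name RecordReaderInitializeSketch and the choice of TextInputFormat are illustrative assumptions, not taken from any of the projects listed below.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

// Illustrative sketch only: shows the split -> createRecordReader -> initialize -> iterate cycle.
public class RecordReaderInitializeSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(args[0]));

        TextInputFormat format = new TextInputFormat();
        TaskAttemptContext context =
                new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        for (InputSplit split : format.getSplits(job)) {
            // A RecordReader must be initialized with its split and context before use.
            RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
            reader.initialize(split, context);
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
            reader.close();
        }
    }
}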
From source file:org.apache.avro.mapreduce.TestAvroKeyRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. "first"
    //   2. "second"
    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
                    Schema.create(Schema.Type.STRING), "first", "second"));

    // Create the record reader.
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    RecordReader<AvroKey<CharSequence>, NullWritable> recordReader =
            new AvroKeyRecordReader<CharSequence>(readerSchema) {
                @Override
                protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
                    return avroFileInput;
                }
            };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f,
            recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    NullWritable value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);
    CharSequence firstString = key.datum();
    assertEquals("first", firstString.toString());
    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);
    CharSequence secondString = key.datum();
    assertEquals("second", secondString.toString());
    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
From source file:org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. <"first", 1>
    //   2. <"second", 2>
    Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.STRING),
            Schema.create(Schema.Type.INT));

    AvroKeyValue<CharSequence, Integer> firstInputRecord =
            new AvroKeyValue<CharSequence, Integer>(new GenericData.Record(keyValueSchema));
    firstInputRecord.setKey("first");
    firstInputRecord.setValue(1);

    AvroKeyValue<CharSequence, Integer> secondInputRecord =
            new AvroKeyValue<CharSequence, Integer>(new GenericData.Record(keyValueSchema));
    secondInputRecord.setKey("second");
    secondInputRecord.setValue(2);

    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
                    firstInputRecord.get(), secondInputRecord.get()));

    // Create the record reader over the avro input file.
    RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader =
            new AvroKeyValueRecordReader<CharSequence, Integer>(
                    Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
                @Override
                protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
                    return avroFileInput;
                }
            };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f,
            recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    AvroValue<Integer> value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);
    assertEquals("first", key.datum().toString());
    assertEquals(1, value.datum().intValue());
    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();
    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);
    assertEquals("second", key.datum().toString());
    assertEquals(2, value.datum().intValue());
    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
From source file:org.apache.carbondata.processing.loading.jsoninput.JsonInputFormat.java
License:Apache License
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    RecordReader<LongWritable, Text> rdr;
    if (JsonInputFormat.getOneRecordPerLine(context.getConfiguration())) {
        rdr = new SimpleJsonRecordReader();
    } else {
        return new JsonRecordReader();
    }
    rdr.initialize(split, context);
    return rdr;
}
From source file:org.apache.carbondata.sdk.file.CarbonReaderBuilder.java
License:Apache License
/**
 * Build CarbonReader
 *
 * @param <T>
 * @return CarbonReader
 * @throws IOException
 * @throws InterruptedException
 */
public <T> CarbonReader<T> build() throws IOException, InterruptedException {
    if (hadoopConf == null) {
        hadoopConf = FileFactory.getConfiguration();
    }
    CarbonTable table;
    // now always infer schema. TODO: Refactor in next version.
    table = CarbonTable.buildTable(tablePath, tableName, hadoopConf);
    final CarbonFileInputFormat format = new CarbonFileInputFormat();
    final Job job = new Job(hadoopConf);
    format.setTableInfo(job.getConfiguration(), table.getTableInfo());
    format.setTablePath(job.getConfiguration(), table.getTablePath());
    format.setTableName(job.getConfiguration(), table.getTableName());
    format.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    if (filterExpression != null) {
        format.setFilterPredicates(job.getConfiguration(), filterExpression);
    }

    if (projectionColumns != null) {
        // set the user projection
        int len = projectionColumns.length;
        // TODO: Handle projection of complex child columns
        for (int i = 0; i < len; i++) {
            if (projectionColumns[i].contains(".")) {
                throw new UnsupportedOperationException(
                        "Complex child columns projection NOT supported through CarbonReader");
            }
        }
        format.setColumnProjection(job.getConfiguration(), projectionColumns);
    }

    try {
        if (filterExpression == null) {
            job.getConfiguration().set("filter_blocks", "false");
        }
        List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
        List<RecordReader<Void, T>> readers = new ArrayList<>(splits.size());
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt =
                    new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader reader;
            QueryModel queryModel = format.createQueryModel(split, attempt);
            boolean hasComplex = false;
            for (ProjectionDimension projectionDimension : queryModel.getProjectionDimensions()) {
                if (projectionDimension.getDimension().isComplex()) {
                    hasComplex = true;
                    break;
                }
            }
            if (useVectorReader && !hasComplex) {
                queryModel.setDirectVectorFill(filterExpression == null);
                reader = new CarbonVectorizedRecordReader(queryModel);
            } else {
                reader = format.createRecordReader(split, attempt);
            }
            try {
                reader.initialize(split, attempt);
                readers.add(reader);
            } catch (Exception e) {
                CarbonUtil.closeStreams(readers.toArray(new RecordReader[0]));
                throw e;
            }
        }
        return new CarbonReader<>(readers);
    } catch (Exception ex) {
        // Clear the datamap cache as it can get added in getSplits() method
        DataMapStoreManager.getInstance().clearDataMaps(table.getAbsoluteTableIdentifier());
        throw ex;
    }
}
From source file:org.apache.carbondata.store.LocalCarbonStore.java
License:Apache License
@Override
public Iterator<CarbonRow> scan(AbsoluteTableIdentifier tableIdentifier, String[] projectColumns,
        Expression filter) throws IOException {
    Objects.requireNonNull(tableIdentifier);
    Objects.requireNonNull(projectColumns);

    CarbonTable table = getTable(tableIdentifier.getTablePath());
    if (table.isStreamingSink() || table.isHivePartitionTable()) {
        throw new UnsupportedOperationException("streaming and partition table is not supported");
    }

    // TODO: use InputFormat to prune data and read data
    final CarbonTableInputFormat format = new CarbonTableInputFormat();
    final Job job = new Job(new Configuration());
    CarbonInputFormat.setTableInfo(job.getConfiguration(), table.getTableInfo());
    CarbonInputFormat.setTablePath(job.getConfiguration(), table.getTablePath());
    CarbonInputFormat.setTableName(job.getConfiguration(), table.getTableName());
    CarbonInputFormat.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    CarbonInputFormat.setCarbonReadSupport(job.getConfiguration(), CarbonRowReadSupport.class);
    CarbonInputFormat.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectColumns));
    if (filter != null) {
        CarbonInputFormat.setFilterPredicates(job.getConfiguration(), filter);
    }

    final List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
    List<RecordReader<Void, Object>> readers = new ArrayList<>(splits.size());
    List<CarbonRow> rows = new ArrayList<>();

    try {
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt =
                    new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
            RecordReader reader = format.createRecordReader(split, attempt);
            reader.initialize(split, attempt);
            readers.add(reader);
        }

        for (RecordReader<Void, Object> reader : readers) {
            while (reader.nextKeyValue()) {
                rows.add((CarbonRow) reader.getCurrentValue());
            }
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    } catch (InterruptedException e) {
        throw new IOException(e);
    } finally {
        for (RecordReader<Void, Object> reader : readers) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    }
    return rows.iterator();
}
From source file:org.apache.crunch.io.hcatalog.HCatRecordDataIterable.java
License:Apache License
@Override
public Iterator<HCatRecord> iterator() {
    try {
        Job job = Job.getInstance(bundle.configure(conf));
        final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
        final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        return Iterators
                .concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<HCatRecord>>() {
                    @Override
                    public Iterator<HCatRecord> apply(InputSplit split) {
                        RecordReader reader = null;
                        try {
                            reader = fmt.createRecordReader(split, ctxt);
                            reader.initialize(split, ctxt);
                        } catch (IOException | InterruptedException e) {
                            throw new CrunchRuntimeException(e);
                        }
                        return new HCatRecordReaderIterator(reader);
                    }
                }).iterator());
    } catch (Exception e) {
        throw new CrunchRuntimeException(e);
    }
}
From source file:org.apache.crunch.io.impl.DefaultFileReaderFactory.java
License:Apache License
@Override
public Iterator<T> read(FileSystem fs, Path path) {
    final Configuration conf = new Configuration(fs.getConf());
    bundle.configure(conf);
    ptype.initialize(conf);

    final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
    final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    try {
        Job job = new Job(conf);
        FileInputFormat.addInputPath(job, path);
        return Iterators.concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<T>>() {
            @Override
            public Iterator<T> apply(InputSplit split) {
                try {
                    RecordReader reader = fmt.createRecordReader(split, ctxt);
                    reader.initialize(split, ctxt);
                    return new RecordReaderIterator<T>(reader, ptype);
                } catch (Exception e) {
                    LOG.error("Error reading split: " + split, e);
                    throw new CrunchRuntimeException(e);
                }
            }
        }).iterator());
    } catch (Exception e) {
        LOG.error("Error reading path: " + path, e);
        throw new CrunchRuntimeException(e);
    }
}
From source file:org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java
License:Apache License
@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
    List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
    Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
    Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

    Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
    for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
        Long endingOffset = endOffsets.get(entry.getKey());
        offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
    }

    KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

    KafkaInputFormat inputFormat = new KafkaInputFormat();
    inputFormat.setConf(config);
    List<InputSplit> splits = inputFormat.getSplits(null);

    assertThat(splits.size(), is(offsets.size()));

    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
        assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
        assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
    }

    // create readers and consume the data
    when(taskContext.getConfiguration()).thenReturn(config);
    Set<String> keysRead = new HashSet<>();
    // read all data from all splits
    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        long start = inputSplit.getStartingOffset();
        long end = inputSplit.getEndingOffset();
        RecordReader<BytesWritable, BytesWritable> recordReader =
                inputFormat.createRecordReader(split, taskContext);
        recordReader.initialize(split, taskContext);

        int numRecordsFound = 0;
        String currentKey;
        while (recordReader.nextKeyValue()) {
            currentKey = new String(recordReader.getCurrentKey().getBytes());
            keysRead.add(currentKey);
            assertThat(keys, hasItem(currentKey));
            assertThat(recordReader.getCurrentValue(), is(notNullValue()));
            numRecordsFound++;
        }
        recordReader.close();

        // assert that it encountered a partition's worth of data
        assertThat(((long) numRecordsFound), is(end - start));
    }

    // validate that the same number of unique keys was read as were written.
    assertThat(keysRead.size(), is(keys.size()));
}
From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License:Apache License
@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License:Apache License
@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}