List of usage examples for org.apache.hadoop.mapreduce.RecordReader#nextKeyValue()
public abstract boolean nextKeyValue() throws IOException, InterruptedException;
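The contract is simple: after initialize(split, context), each call to nextKeyValue() advances the reader and returns true while another record is available; getCurrentKey() and getCurrentValue() then return the record that was just read. Every example below is a variation of the following read loop (a minimal sketch; the TextInputFormat-style key/value types and the inputFormat, split, and context variables are illustrative assumptions, not taken from any example on this page):

// Minimal sketch of the standard RecordReader read loop.
// inputFormat, split, and context are assumed to come from the job setup.
RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
try {
  while (reader.nextKeyValue()) {
    LongWritable key = reader.getCurrentKey();
    Text value = reader.getCurrentValue();
    // process (key, value) here
  }
} finally {
  reader.close(); // always release the underlying stream
}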
From source file: org.apache.avro.mapreduce.TestAvroKeyRecordReader.java
License: Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
  // Create the test avro file input with two records:
  //   1. "first"
  //   2. "second"
  final SeekableInput avroFileInput = new SeekableFileInput(
      AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
          Schema.create(Schema.Type.STRING), "first", "second"));

  // Create the record reader.
  Schema readerSchema = Schema.create(Schema.Type.STRING);
  RecordReader<AvroKey<CharSequence>, NullWritable> recordReader =
      new AvroKeyRecordReader<CharSequence>(readerSchema) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
          return avroFileInput;
        }
      };

  // Set up the job configuration.
  Configuration conf = new Configuration();

  // Create a mock input split for this record reader.
  FileSplit inputSplit = createMock(FileSplit.class);
  expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
  expect(inputSplit.getStart()).andReturn(0L).anyTimes();
  expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

  // Create a mock task attempt context for this record reader.
  TaskAttemptContext context = createMock(TaskAttemptContext.class);
  expect(context.getConfiguration()).andReturn(conf).anyTimes();

  // Initialize the record reader.
  replay(inputSplit);
  replay(context);
  recordReader.initialize(inputSplit, context);

  assertEquals("Progress should be zero before any records are read",
      0.0f, recordReader.getProgress(), 0.0f);

  // Some variables to hold the records.
  AvroKey<CharSequence> key;
  NullWritable value;

  // Read the first record.
  assertTrue("Expected at least one record", recordReader.nextKeyValue());
  key = recordReader.getCurrentKey();
  value = recordReader.getCurrentValue();
  assertNotNull("First record had null key", key);
  assertNotNull("First record had null value", value);
  CharSequence firstString = key.datum();
  assertEquals("first", firstString.toString());
  assertTrue("getCurrentKey() returned different keys for the same record",
      key == recordReader.getCurrentKey());
  assertTrue("getCurrentValue() returned different values for the same record",
      value == recordReader.getCurrentValue());

  // Read the second record.
  assertTrue("Expected to read a second record", recordReader.nextKeyValue());
  key = recordReader.getCurrentKey();
  value = recordReader.getCurrentValue();
  assertNotNull("Second record had null key", key);
  assertNotNull("Second record had null value", value);
  CharSequence secondString = key.datum();
  assertEquals("second", secondString.toString());

  assertEquals("Progress should be complete (2 out of 2 records processed)",
      1.0f, recordReader.getProgress(), 0.0f);

  // There should be no more records.
  assertFalse("Expected only 2 records", recordReader.nextKeyValue());

  // Close the record reader.
  recordReader.close();

  // Verify the expected calls on the mocks.
  verify(inputSplit);
  verify(context);
}
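Note the identity assertions above (key == recordReader.getCurrentKey()): the same key/value objects are returned for a given record, and many RecordReader implementations also reuse those objects across calls to nextKeyValue(). Code that buffers records beyond one iteration should therefore copy them, as the TeraInputFormat sampler further down does with new Text(reader.getCurrentKey()). A minimal sketch, assuming a Text-keyed reader:

// Defensive copy when collecting keys across nextKeyValue() calls;
// Text's copy constructor snapshots the current bytes.
List<Text> sampledKeys = new ArrayList<>();
while (reader.nextKeyValue()) {
  sampledKeys.add(new Text(reader.getCurrentKey()));
}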
From source file: org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java
License: Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
  // Create the test avro file input with two records:
  //   1. <"first", 1>
  //   2. <"second", 2>
  Schema keyValueSchema = AvroKeyValue.getSchema(
      Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT));

  AvroKeyValue<CharSequence, Integer> firstInputRecord =
      new AvroKeyValue<CharSequence, Integer>(new GenericData.Record(keyValueSchema));
  firstInputRecord.setKey("first");
  firstInputRecord.setValue(1);

  AvroKeyValue<CharSequence, Integer> secondInputRecord =
      new AvroKeyValue<CharSequence, Integer>(new GenericData.Record(keyValueSchema));
  secondInputRecord.setKey("second");
  secondInputRecord.setValue(2);

  final SeekableInput avroFileInput = new SeekableFileInput(
      AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
          firstInputRecord.get(), secondInputRecord.get()));

  // Create the record reader over the avro input file.
  RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader =
      new AvroKeyValueRecordReader<CharSequence, Integer>(
          Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
          return avroFileInput;
        }
      };

  // Set up the job configuration.
  Configuration conf = new Configuration();

  // Create a mock input split for this record reader.
  FileSplit inputSplit = createMock(FileSplit.class);
  expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
  expect(inputSplit.getStart()).andReturn(0L).anyTimes();
  expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

  // Create a mock task attempt context for this record reader.
  TaskAttemptContext context = createMock(TaskAttemptContext.class);
  expect(context.getConfiguration()).andReturn(conf).anyTimes();

  // Initialize the record reader.
  replay(inputSplit);
  replay(context);
  recordReader.initialize(inputSplit, context);

  assertEquals("Progress should be zero before any records are read",
      0.0f, recordReader.getProgress(), 0.0f);

  // Some variables to hold the records.
  AvroKey<CharSequence> key;
  AvroValue<Integer> value;

  // Read the first record.
  assertTrue("Expected at least one record", recordReader.nextKeyValue());
  key = recordReader.getCurrentKey();
  value = recordReader.getCurrentValue();
  assertNotNull("First record had null key", key);
  assertNotNull("First record had null value", value);
  assertEquals("first", key.datum().toString());
  assertEquals(1, value.datum().intValue());
  assertTrue("getCurrentKey() returned different keys for the same record",
      key == recordReader.getCurrentKey());
  assertTrue("getCurrentValue() returned different values for the same record",
      value == recordReader.getCurrentValue());

  // Read the second record.
  assertTrue("Expected to read a second record", recordReader.nextKeyValue());
  key = recordReader.getCurrentKey();
  value = recordReader.getCurrentValue();
  assertNotNull("Second record had null key", key);
  assertNotNull("Second record had null value", value);
  assertEquals("second", key.datum().toString());
  assertEquals(2, value.datum().intValue());

  assertEquals("Progress should be complete (2 out of 2 records processed)",
      1.0f, recordReader.getProgress(), 0.0f);

  // There should be no more records.
  assertFalse("Expected only 2 records", recordReader.nextKeyValue());

  // Close the record reader.
  recordReader.close();

  // Verify the expected calls on the mocks.
  verify(inputSplit);
  verify(context);
}
From source file: org.apache.carbondata.store.LocalCarbonStore.java
License: Apache License

@Override
public Iterator<CarbonRow> scan(AbsoluteTableIdentifier tableIdentifier, String[] projectColumns,
    Expression filter) throws IOException {
  Objects.requireNonNull(tableIdentifier);
  Objects.requireNonNull(projectColumns);

  CarbonTable table = getTable(tableIdentifier.getTablePath());
  if (table.isStreamingSink() || table.isHivePartitionTable()) {
    throw new UnsupportedOperationException("streaming and partition table is not supported");
  }

  // TODO: use InputFormat to prune data and read data
  final CarbonTableInputFormat format = new CarbonTableInputFormat();
  final Job job = new Job(new Configuration());
  CarbonInputFormat.setTableInfo(job.getConfiguration(), table.getTableInfo());
  CarbonInputFormat.setTablePath(job.getConfiguration(), table.getTablePath());
  CarbonInputFormat.setTableName(job.getConfiguration(), table.getTableName());
  CarbonInputFormat.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
  CarbonInputFormat.setCarbonReadSupport(job.getConfiguration(), CarbonRowReadSupport.class);
  CarbonInputFormat.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectColumns));
  if (filter != null) {
    CarbonInputFormat.setFilterPredicates(job.getConfiguration(), filter);
  }

  final List<InputSplit> splits =
      format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
  List<RecordReader<Void, Object>> readers = new ArrayList<>(splits.size());
  List<CarbonRow> rows = new ArrayList<>();

  try {
    for (InputSplit split : splits) {
      TaskAttemptContextImpl attempt =
          new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
      RecordReader reader = format.createRecordReader(split, attempt);
      reader.initialize(split, attempt);
      readers.add(reader);
    }

    for (RecordReader<Void, Object> reader : readers) {
      while (reader.nextKeyValue()) {
        rows.add((CarbonRow) reader.getCurrentValue());
      }
      try {
        reader.close();
      } catch (IOException e) {
        LOGGER.error(e);
      }
    }
  } catch (InterruptedException e) {
    throw new IOException(e);
  } finally {
    for (RecordReader<Void, Object> reader : readers) {
      try {
        reader.close();
      } catch (IOException e) {
        LOGGER.error(e);
      }
    }
  }
  return rows.iterator();
}
From source file: org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java
License: Apache License

@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
  List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
  Map<TopicPartition, Long> startOffsets =
      getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
  Map<TopicPartition, Long> endOffsets =
      getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

  Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
  for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
    Long endingOffset = endOffsets.get(entry.getKey());
    offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
  }

  KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

  KafkaInputFormat inputFormat = new KafkaInputFormat();
  inputFormat.setConf(config);
  List<InputSplit> splits = inputFormat.getSplits(null);

  assertThat(splits.size(), is(offsets.size()));

  for (InputSplit split : splits) {
    KafkaInputSplit inputSplit = (KafkaInputSplit) split;
    Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
    assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
    assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
  }

  // Create readers and consume the data.
  when(taskContext.getConfiguration()).thenReturn(config);
  Set<String> keysRead = new HashSet<>();

  // Read all data from all splits.
  for (InputSplit split : splits) {
    KafkaInputSplit inputSplit = (KafkaInputSplit) split;
    long start = inputSplit.getStartingOffset();
    long end = inputSplit.getEndingOffset();

    RecordReader<BytesWritable, BytesWritable> recordReader =
        inputFormat.createRecordReader(split, taskContext);
    recordReader.initialize(split, taskContext);

    int numRecordsFound = 0;
    String currentKey;
    while (recordReader.nextKeyValue()) {
      currentKey = new String(recordReader.getCurrentKey().getBytes());
      keysRead.add(currentKey);
      assertThat(keys, hasItem(currentKey));
      assertThat(recordReader.getCurrentValue(), is(notNullValue()));
      numRecordsFound++;
    }
    recordReader.close();

    // Assert that it encountered a partition's worth of data.
    assertThat(((long) numRecordsFound), is(end - start));
  }

  // Validate that the same number of unique keys was read as were written.
  assertThat(keysRead.size(), is(keys.size()));
}
From source file: org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License: Apache License

@Test
public void testRead() throws IOException, InterruptedException {
  InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

  reader.initialize(split, context);
  reader.nextKeyValue();

  OrcStruct data = (OrcStruct) reader.getCurrentValue();
  MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

  Assert.assertTrue(row.getEvent().keySet().size() == 4);
  Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
  Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
  Assert.assertEquals(col1, row.getEvent().get("col1"));
  Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

  reader.close();
}
From source file: org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License: Apache License

@Test
public void testReadDateColumn() throws IOException, InterruptedException {
  File testFile2 = makeOrcFileWithDate();
  Path path = new Path(testFile2.getAbsoluteFile().toURI());
  FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

  InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
  TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
  RecordReader reader = inputFormat.createRecordReader(split, context);
  InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

  reader.initialize(split, context);
  reader.nextKeyValue();

  OrcStruct data = (OrcStruct) reader.getCurrentValue();
  MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

  Assert.assertTrue(row.getEvent().keySet().size() == 4);
  Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
  Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
  Assert.assertEquals(col1, row.getEvent().get("col1"));
  Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

  reader.close();
}
From source file: org.apache.hadoop.examples.terasort.TeraInputFormat.java
License: Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
  long t1 = System.currentTimeMillis();
  Configuration conf = job.getConfiguration();
  final TeraInputFormat inFormat = new TeraInputFormat();
  final TextSampler sampler = new TextSampler();
  int partitions = job.getNumReduceTasks();
  long sampleSize = conf.getLong(TeraSortConfigKeys.SAMPLE_SIZE.key(),
      TeraSortConfigKeys.DEFAULT_SAMPLE_SIZE);
  final List<InputSplit> splits = inFormat.getSplits(job);
  long t2 = System.currentTimeMillis();
  System.out.println("Computing input splits took " + (t2 - t1) + "ms");

  int samples = Math.min(
      conf.getInt(TeraSortConfigKeys.NUM_PARTITIONS.key(), TeraSortConfigKeys.DEFAULT_NUM_PARTITIONS),
      splits.size());
  System.out.println("Sampling " + samples + " splits of " + splits.size());
  final long recordsPerSample = sampleSize / samples;
  final int sampleStep = splits.size() / samples;

  Thread[] samplerReader = new Thread[samples];
  SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");

  // Take N samples from different parts of the input.
  for (int i = 0; i < samples; ++i) {
    final int idx = i;
    samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
      {
        setDaemon(true);
      }

      public void run() {
        long records = 0;
        try {
          TaskAttemptContext context =
              new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
          RecordReader<Text, Text> reader =
              inFormat.createRecordReader(splits.get(sampleStep * idx), context);
          reader.initialize(splits.get(sampleStep * idx), context);
          while (reader.nextKeyValue()) {
            // Copy the key: the reader may reuse the same Text instance.
            sampler.addKey(new Text(reader.getCurrentKey()));
            records += 1;
            if (recordsPerSample <= records) {
              break;
            }
          }
        } catch (IOException ie) {
          System.err.println("Got an exception while reading splits "
              + StringUtils.stringifyException(ie));
          throw new RuntimeException(ie);
        } catch (InterruptedException e) {
          // Sampling was interrupted; let the thread exit.
        }
      }
    };
    samplerReader[i].start();
  }

  FileSystem outFs = partFile.getFileSystem(conf);
  DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
      outFs.getDefaultBlockSize(partFile));
  for (int i = 0; i < samples; i++) {
    try {
      samplerReader[i].join();
      if (threadGroup.getThrowable() != null) {
        throw threadGroup.getThrowable();
      }
    } catch (InterruptedException e) {
      // Ignore the interrupt and keep joining the remaining threads.
    }
  }
  for (Text split : sampler.createPartitions(partitions)) {
    split.write(writer);
  }
  writer.close();
  long t3 = System.currentTimeMillis();
  System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file: org.apache.hcatalog.pig.TestE2EScenarios.java
License: Apache License

private void copyTable(String in, String out) throws IOException, InterruptedException {
  Job ijob = new Job();
  Job ojob = new Job();
  HCatInputFormat inpy = new HCatInputFormat();
  inpy.setInput(ijob, null, in);
  HCatOutputFormat oupy = new HCatOutputFormat();
  oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

  // Test HCatContext
  System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
  if (HCatContext.INSTANCE.getConf().isPresent()) {
    System.err.println("HCatContext tinyint->int promotion says "
        + HCatContext.INSTANCE.getConf().get().getBoolean(
            HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
            HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
  }

  HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
  System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
  oupy.setSchema(ojob, tableSchema);
  oupy.checkOutputSpecs(ojob);

  OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
  oc.setupJob(ojob);

  for (InputSplit split : inpy.getSplits(ijob)) {
    TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
    TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

    RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
    rr.initialize(split, rtaskContext);

    OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
    taskOc.setupTask(wtaskContext);
    RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

    while (rr.nextKeyValue()) {
      rw.write(rr.getCurrentKey(), rr.getCurrentValue());
    }

    rw.close(wtaskContext);
    taskOc.commitTask(wtaskContext);
    rr.close();
  }

  oc.commitJob(ojob);
}
From source file: org.apache.hcatalog.rcfile.TestRCFileMapReduceInputFormat.java
License: Apache License

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber,
    long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
  Path testDir = new Path(System.getProperty("test.data.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);

  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(RCFile.RECORD_INTERVAL_CONF_STR, intervalRecordCount);

  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);

  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    writer.append(bytes);
  }
  writer.close();

  RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat =
      new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
  Configuration jobConf = new Configuration(cloneConf);
  jobConf.set("mapred.input.dir", testDir.toString());
  JobContext context = new Job(jobConf);
  context.getConfiguration().setLong("mapred.max.split.size", maxSplitSize);

  List<InputSplit> splits = inputFormat.getSplits(context);
  assertEquals("splits length should be " + splitNumber, splits.size(), splitNumber);

  int readCount = 0;
  for (int i = 0; i < splits.size(); i++) {
    TaskAttemptContext tac = HCatHadoopShims.Instance.get().createTaskAttemptContext(jobConf,
        new TaskAttemptID());
    RecordReader<LongWritable, BytesRefArrayWritable> rr =
        inputFormat.createRecordReader(splits.get(i), tac);
    rr.initialize(splits.get(i), tac);
    while (rr.nextKeyValue()) {
      readCount++;
    }
  }
  assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}
From source file: org.apache.hive.hcatalog.rcfile.TestRCFileMapReduceInputFormat.java
License: Apache License

private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount, int splitNumber,
    long maxSplitSize, CompressionCodec codec) throws IOException, InterruptedException {
  Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);

  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  cloneConf.setInt(HiveConf.ConfVars.HIVE_RCFILE_RECORD_INTERVAL.varname, intervalRecordCount);

  RCFile.Writer writer = new RCFile.Writer(fs, cloneConf, testFile, null, codec);

  BytesRefArrayWritable bytes = new BytesRefArrayWritable(bytesArray.length);
  for (int i = 0; i < bytesArray.length; i++) {
    BytesRefWritable cu = new BytesRefWritable(bytesArray[i], 0, bytesArray[i].length);
    bytes.set(i, cu);
  }
  for (int i = 0; i < writeCount; i++) {
    writer.append(bytes);
  }
  writer.close();

  RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable> inputFormat =
      new RCFileMapReduceInputFormat<LongWritable, BytesRefArrayWritable>();
  Configuration jobConf = new Configuration(cloneConf);
  jobConf.set("mapred.input.dir", testDir.toString());
  JobContext context = new Job(jobConf);
  HiveConf.setLongVar(context.getConfiguration(), HiveConf.ConfVars.MAPREDMAXSPLITSIZE, maxSplitSize);

  List<InputSplit> splits = inputFormat.getSplits(context);
  assertEquals("splits length should be " + splitNumber, splits.size(), splitNumber);

  int readCount = 0;
  for (int i = 0; i < splits.size(); i++) {
    TaskAttemptContext tac = ShimLoader.getHadoopShims().getHCatShim().createTaskAttemptContext(jobConf,
        new TaskAttemptID());
    RecordReader<LongWritable, BytesRefArrayWritable> rr =
        inputFormat.createRecordReader(splits.get(i), tac);
    rr.initialize(splits.get(i), tac);
    while (rr.nextKeyValue()) {
      readCount++;
    }
  }
  assertEquals("readCount should be equal to writeCount", readCount, writeCount);
}