List of usage examples for org.apache.hadoop.mapreduce.RecordReader#getCurrentValue()
public abstract VALUEIN getCurrentValue() throws IOException, InterruptedException;
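Before the project examples below, here is a minimal sketch of the canonical consumption loop in which getCurrentValue() is called. It is not taken from any of the projects listed; the class name, the helper method, and the use of TextInputFormat are illustrative assumptions, and the split and context are presumed to be supplied by the framework or a test harness.

// A minimal sketch, not from any project below. Assumptions: `split` and `context`
// are provided by the caller; TextInputFormat is used purely for illustration.
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class GetCurrentValueSketch {
    static void dumpSplit(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        TextInputFormat inputFormat = new TextInputFormat();
        RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
        reader.initialize(split, context);
        try {
            while (reader.nextKeyValue()) {
                // getCurrentValue() returns the value of the record that the last
                // successful nextKeyValue() positioned the reader on. Many readers
                // reuse the returned object, so copy it if it must outlive the loop.
                LongWritable key = reader.getCurrentKey();
                Text value = reader.getCurrentValue();
                System.out.println(key.get() + "\t" + value.toString());
            }
        } finally {
            reader.close();
        }
    }
}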
From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License:Open Source License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);
    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat =
            new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat =
            new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter =
            outputFormat.getRecordWriter(conf, output, CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();

    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {
        System.out.println("Processing split: " + split.getPath().toString() + " (" + splitIndex + " of "
                + splits.size() + ")");

        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader =
                inputFormat.createRecordReader(split, ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {
            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();

            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());

            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }
        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
From source file:it.crs4.pydoop.mapreduce.pipes.PipesMapper.java
License:Apache License
@Override
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    Configuration conf = context.getConfiguration();
    InputSplit split = context.getInputSplit();
    // FIXME: do we really need to be so convoluted?
    InputFormat<K1, V1> inputFormat;
    try {
        inputFormat = (InputFormat<K1, V1>) ReflectionUtils.newInstance(context.getInputFormatClass(), conf);
    } catch (ClassNotFoundException ce) {
        throw new RuntimeException("class not found", ce);
    }
    RecordReader<K1, V1> input = inputFormat.createRecordReader(split, context);
    input.initialize(split, context);
    boolean isJavaInput = Submitter.getIsJavaRecordReader(conf);
    try {
        // FIXME: what happens for a java mapper and no java record reader?
        DummyRecordReader fakeInput = (!isJavaInput && !Submitter.getIsJavaMapper(conf))
                ? (DummyRecordReader) input
                : null;
        application = new Application<K1, V1, K2, V2>(context, fakeInput);
    } catch (InterruptedException ie) {
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol<K1, V1> downlink = application.getDownlink();
    // FIXME: InputSplit is not Writable, but still, this is ugly...
    downlink.runMap((FileSplit) context.getInputSplit(), context.getNumReduceTasks(), isJavaInput);

    boolean skipping = conf.getBoolean(context.SKIP_RECORDS, false);
    boolean sent_input_types = false;
    try {
        if (isJavaInput) {
            // FIXME
            while (input.nextKeyValue()) {
                if (!sent_input_types) {
                    sent_input_types = true;
                    NullWritable n = NullWritable.get();
                    String kclass_name = n.getClass().getName();
                    String vclass_name = n.getClass().getName();
                    if (input.getCurrentKey() != null) {
                        kclass_name = input.getCurrentKey().getClass().getName();
                    }
                    if (input.getCurrentValue() != null) {
                        vclass_name = input.getCurrentValue().getClass().getName();
                    }
                    downlink.setInputTypes(kclass_name, vclass_name);
                }
                downlink.mapItem(input.getCurrentKey(), input.getCurrentValue());
                if (skipping) {
                    // flush the streams on every record input if running in skip mode
                    // so that we don't buffer other records surrounding a bad record.
                    downlink.flush();
                }
            }
            downlink.endOfInput();
        }
        application.waitForFinish();
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        cleanup(context);
    }
}
From source file:org.apache.avro.mapreduce.TestAvroKeyRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. "first"
    //   2. "second"
    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myStringfile.avro"),
                    Schema.create(Schema.Type.STRING), "first", "second"));

    // Create the record reader.
    Schema readerSchema = Schema.create(Schema.Type.STRING);
    RecordReader<AvroKey<CharSequence>, NullWritable> recordReader = new AvroKeyRecordReader<CharSequence>(
            readerSchema) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    NullWritable value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    CharSequence firstString = key.datum();
    assertEquals("first", firstString.toString());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    CharSequence secondString = key.datum();
    assertEquals("second", secondString.toString());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
From source file:org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java
License:Apache License
/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. <"first", 1>
    //   2. <"second", 2>
    Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.STRING),
            Schema.create(Schema.Type.INT));

    AvroKeyValue<CharSequence, Integer> firstInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    firstInputRecord.setKey("first");
    firstInputRecord.setValue(1);

    AvroKeyValue<CharSequence, Integer> secondInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    secondInputRecord.setKey("second");
    secondInputRecord.setValue(2);

    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
                    firstInputRecord.get(), secondInputRecord.get()));

    // Create the record reader over the avro input file.
    RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader =
            new AvroKeyValueRecordReader<CharSequence, Integer>(
                    Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    AvroValue<Integer> value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    assertEquals("first", key.datum().toString());
    assertEquals(1, value.datum().intValue());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    assertEquals("second", key.datum().toString());
    assertEquals(2, value.datum().intValue());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f,
            recordReader.getProgress(), 0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}
From source file:org.apache.carbondata.store.LocalCarbonStore.java
License:Apache License
@Override
public Iterator<CarbonRow> scan(AbsoluteTableIdentifier tableIdentifier, String[] projectColumns,
        Expression filter) throws IOException {
    Objects.requireNonNull(tableIdentifier);
    Objects.requireNonNull(projectColumns);

    CarbonTable table = getTable(tableIdentifier.getTablePath());
    if (table.isStreamingSink() || table.isHivePartitionTable()) {
        throw new UnsupportedOperationException("streaming and partition table is not supported");
    }

    // TODO: use InputFormat to prune data and read data
    final CarbonTableInputFormat format = new CarbonTableInputFormat();
    final Job job = new Job(new Configuration());
    CarbonInputFormat.setTableInfo(job.getConfiguration(), table.getTableInfo());
    CarbonInputFormat.setTablePath(job.getConfiguration(), table.getTablePath());
    CarbonInputFormat.setTableName(job.getConfiguration(), table.getTableName());
    CarbonInputFormat.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    CarbonInputFormat.setCarbonReadSupport(job.getConfiguration(), CarbonRowReadSupport.class);
    CarbonInputFormat.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectColumns));
    if (filter != null) {
        CarbonInputFormat.setFilterPredicates(job.getConfiguration(), filter);
    }

    final List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));
    List<RecordReader<Void, Object>> readers = new ArrayList<>(splits.size());
    List<CarbonRow> rows = new ArrayList<>();

    try {
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader reader = format.createRecordReader(split, attempt);
            reader.initialize(split, attempt);
            readers.add(reader);
        }

        for (RecordReader<Void, Object> reader : readers) {
            while (reader.nextKeyValue()) {
                rows.add((CarbonRow) reader.getCurrentValue());
            }
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    } catch (InterruptedException e) {
        throw new IOException(e);
    } finally {
        for (RecordReader<Void, Object> reader : readers) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    }
    return rows.iterator();
}
From source file:org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java
License:Apache License
@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
    List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
    Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(), topic);
    Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

    Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
    for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
        Long endingOffset = endOffsets.get(entry.getKey());
        offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
    }

    KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

    KafkaInputFormat inputFormat = new KafkaInputFormat();
    inputFormat.setConf(config);
    List<InputSplit> splits = inputFormat.getSplits(null);

    assertThat(splits.size(), is(offsets.size()));

    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
        assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
        assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
    }

    // create readers and consume the data
    when(taskContext.getConfiguration()).thenReturn(config);
    Set<String> keysRead = new HashSet<>();
    // read all data from all splits
    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        long start = inputSplit.getStartingOffset();
        long end = inputSplit.getEndingOffset();

        RecordReader<BytesWritable, BytesWritable> recordReader = inputFormat.createRecordReader(split,
                taskContext);
        recordReader.initialize(split, taskContext);

        int numRecordsFound = 0;
        String currentKey;
        while (recordReader.nextKeyValue()) {
            currentKey = new String(recordReader.getCurrentKey().getBytes());
            keysRead.add(currentKey);
            assertThat(keys, hasItem(currentKey));
            assertThat(recordReader.getCurrentValue(), is(notNullValue()));
            numRecordsFound++;
        }
        recordReader.close();

        // assert that it encountered a partition's worth of data
        assertThat(((long) numRecordsFound), is(end - start));
    }

    // validate the same number of unique keys was read as were written.
    assertThat(keysRead.size(), is(keys.size()));
}
From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License:Apache License
@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License:Apache License
@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file:org.apache.hcatalog.pig.TestE2EScenarios.java
License:Apache License
private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext
    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {
        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }

    oc.commitJob(ojob);
}
From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }

                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}