List of usage examples for org.apache.hadoop.mapreduce.InputFormat.createRecordReader
public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
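Every example below follows the same contract: obtain an InputFormat, build a TaskAttemptContext, call createRecordReader for each split, call initialize(split, context) on the returned reader before the first read, then iterate with nextKeyValue()/getCurrentValue() and close the reader. The following minimal, self-contained sketch shows that call sequence in isolation; the choice of TextInputFormat and the caller-supplied input path are illustrative assumptions, not taken from any of the source files below.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class CreateRecordReaderSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Hypothetical input path, supplied by the caller.
        FileInputFormat.addInputPath(job, new Path(args[0]));

        // TextInputFormat is an arbitrary concrete InputFormat chosen for the sketch.
        TextInputFormat format = new TextInputFormat();
        TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        List<InputSplit> splits = format.getSplits(job);
        for (InputSplit split : splits) {
            // One reader per split; initialize() must run before the first nextKeyValue().
            RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
            reader.initialize(split, context);
            try {
                while (reader.nextKeyValue()) {
                    LongWritable offset = reader.getCurrentKey();
                    Text line = reader.getCurrentValue();
                    System.out.println(offset + "\t" + line);
                }
            } finally {
                reader.close();
            }
        }
    }
}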
From source file: org.apache.crunch.io.hcatalog.HCatRecordDataIterable.java
License: Apache License

@Override
public Iterator<HCatRecord> iterator() {
    try {
        Job job = Job.getInstance(bundle.configure(conf));
        final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
        final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        return Iterators.concat(Lists.transform(fmt.getSplits(job),
                new Function<InputSplit, Iterator<HCatRecord>>() {
                    @Override
                    public Iterator<HCatRecord> apply(InputSplit split) {
                        RecordReader reader = null;
                        try {
                            reader = fmt.createRecordReader(split, ctxt);
                            reader.initialize(split, ctxt);
                        } catch (IOException | InterruptedException e) {
                            throw new CrunchRuntimeException(e);
                        }
                        return new HCatRecordReaderIterator(reader);
                    }
                }).iterator());
    } catch (Exception e) {
        throw new CrunchRuntimeException(e);
    }
}
From source file: org.apache.crunch.io.impl.DefaultFileReaderFactory.java
License: Apache License

@Override
public Iterator<T> read(FileSystem fs, Path path) {
    final Configuration conf = new Configuration(fs.getConf());
    bundle.configure(conf);
    ptype.initialize(conf);
    final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
    final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    try {
        Job job = new Job(conf);
        FileInputFormat.addInputPath(job, path);
        return Iterators.concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<T>>() {
            @Override
            public Iterator<T> apply(InputSplit split) {
                try {
                    RecordReader reader = fmt.createRecordReader(split, ctxt);
                    reader.initialize(split, ctxt);
                    return new RecordReaderIterator<T>(reader, ptype);
                } catch (Exception e) {
                    LOG.error("Error reading split: " + split, e);
                    throw new CrunchRuntimeException(e);
                }
            }
        }).iterator());
    } catch (Exception e) {
        LOG.error("Error reading path: " + path, e);
        throw new CrunchRuntimeException(e);
    }
}
From source file: org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License: Apache License

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();
    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file: org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java
License: Apache License

@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();
    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file: org.apache.druid.data.input.orc.OrcHadoopInputRowParserTest.java
License: Apache License

private static OrcStruct getFirstRow(Job job, String orcPath) throws IOException, InterruptedException {
    File testFile = new File(orcPath);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        reader.initialize(split, context);
        reader.nextKeyValue();
        return (OrcStruct) reader.getCurrentValue();
    }
}
From source file: org.apache.druid.data.input.orc.OrcHadoopInputRowParserTest.java
License: Apache License

private static List<InputRow> getAllRows(HadoopDruidIndexerConfig config)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);

    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();
        reader.initialize(split, context);
        // Advance exactly once per iteration; a second nextKeyValue() call inside
        // the loop body would silently skip every other record.
        while (reader.nextKeyValue()) {
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }
        return records;
    }
}
From source file: org.apache.druid.data.input.parquet.BaseParquetInputTest.java
License: Apache License

static Object getFirstRow(Job job, String parserType, String parquetPath)
        throws IOException, InterruptedException {
    File testFile = new File(parquetPath);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass.get(parserType),
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        reader.initialize(split, context);
        reader.nextKeyValue();
        return reader.getCurrentValue();
    }
}
From source file: org.apache.druid.data.input.parquet.BaseParquetInputTest.java
License: Apache License

static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);

    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass.get(parserType),
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();
        reader.initialize(split, context);
        // Advance exactly once per iteration; a second nextKeyValue() call inside
        // the loop body would silently skip every other record.
        while (reader.nextKeyValue()) {
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }
        return records;
    }
}
From source file: org.apache.hyracks.dataflow.hadoop.mapreduce.MapperOperatorDescriptor.java
License: Apache License

@SuppressWarnings("deprecation")
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final HadoopHelper helper = new HadoopHelper(config);
    final Configuration conf = helper.getConfiguration();
    final Mapper<K1, V1, K2, V2> mapper = helper.getMapper();
    final InputFormat<K1, V1> inputFormat = helper.getInputFormat();
    final IInputSplitProvider isp = factory.createInputSplitProvider(partition);
    final TaskAttemptID taId = new TaskAttemptID("foo", jobId, true, partition, 0);
    final TaskAttemptContext taskAttemptContext = helper.createTaskAttemptContext(taId);
    final int framesLimit = helper.getSortFrameLimit(ctx);
    final IBinaryComparatorFactory[] comparatorFactories = helper.getSortComparatorFactories();

    // Collects map output into frames and sorts each block with an external sort run generator.
    class SortingRecordWriter extends RecordWriter<K2, V2> {
        private final ArrayTupleBuilder tb;
        private final IFrame frame;
        private final FrameTupleAppender fta;
        private ExternalSortRunGenerator runGen;
        private int blockId;

        public SortingRecordWriter() throws HyracksDataException {
            tb = new ArrayTupleBuilder(2);
            frame = new VSizeFrame(ctx);
            fta = new FrameTupleAppender(frame);
        }

        public void initBlock(int blockId) throws HyracksDataException {
            runGen = new ExternalSortRunGenerator(ctx, new int[] { 0 }, null, comparatorFactories,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), Algorithm.MERGE_SORT, framesLimit);
            this.blockId = blockId;
        }

        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        @Override
        public void write(K2 key, V2 value) throws IOException, InterruptedException {
            DataOutput dos = tb.getDataOutput();
            tb.reset();
            key.write(dos);
            tb.addFieldEndOffset();
            value.write(dos);
            tb.addFieldEndOffset();
            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
                if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("Record size (" + tb.getSize()
                            + ") larger than frame size (" + frame.getBuffer().capacity() + ")");
                }
            }
        }

        public void sortAndFlushBlock(final IFrameWriter writer) throws HyracksDataException {
            if (fta.getTupleCount() > 0) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
            }
            runGen.close();
            // Appends the block id as a third field to every sorted tuple before forwarding it.
            IFrameWriter delegatingWriter = new IFrameWriter() {
                private final FrameTupleAppender appender = new FrameTupleAppender(new VSizeFrame(ctx));
                private final FrameTupleAccessor fta = new FrameTupleAccessor(
                        helper.getMapOutputRecordDescriptorWithoutExtraFields());
                private final ArrayTupleBuilder tb = new ArrayTupleBuilder(3);

                @Override
                public void open() throws HyracksDataException {
                }

                @Override
                public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
                    fta.reset(buffer);
                    int n = fta.getTupleCount();
                    for (int i = 0; i < n; ++i) {
                        tb.reset();
                        tb.addField(fta, i, 0);
                        tb.addField(fta, i, 1);
                        try {
                            tb.getDataOutput().writeInt(blockId);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                        tb.addFieldEndOffset();
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            appender.flush(writer, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }
                }

                @Override
                public void close() throws HyracksDataException {
                    appender.flush(writer, true);
                }

                @Override
                public void fail() throws HyracksDataException {
                    // TODO Auto-generated method stub
                }
            };
            if (helper.hasCombiner()) {
                Reducer<K2, V2, K2, V2> combiner = helper.getCombiner();
                TaskAttemptID ctaId = new TaskAttemptID("foo", jobId, true, partition, 0);
                // Attempt context for the combiner's own task attempt id (ctaId).
                TaskAttemptContext ctaskAttemptContext = helper.createTaskAttemptContext(ctaId);
                final IFrameWriter outputWriter = delegatingWriter;
                RecordWriter<K2, V2> recordWriter = new RecordWriter<K2, V2>() {
                    private final FrameTupleAppender fta = new FrameTupleAppender(new VSizeFrame(ctx));
                    private final ArrayTupleBuilder tb = new ArrayTupleBuilder(2);

                    {
                        outputWriter.open();
                    }

                    @Override
                    public void write(K2 key, V2 value) throws IOException, InterruptedException {
                        DataOutput dos = tb.getDataOutput();
                        tb.reset();
                        key.write(dos);
                        tb.addFieldEndOffset();
                        value.write(dos);
                        tb.addFieldEndOffset();
                        if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            fta.flush(outputWriter, true);
                            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }

                    @Override
                    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                        fta.flush(outputWriter, true);
                    }
                };
                delegatingWriter = new ReduceWriter<K2, V2, K2, V2>(ctx, helper,
                        new int[] { HadoopHelper.KEY_FIELD_INDEX }, helper.getGroupingComparatorFactories(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields(), combiner, recordWriter, ctaId,
                        ctaskAttemptContext);
            }
            IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
            for (int i = 0; i < comparatorFactories.length; ++i) {
                comparators[i] = comparatorFactories[i].createBinaryComparator();
            }
            ExternalSortRunMerger merger = new ExternalSortRunMerger(ctx, runGen.getSorter(), runGen.getRuns(),
                    new int[] { 0 }, comparators, null, helper.getMapOutputRecordDescriptorWithoutExtraFields(),
                    framesLimit, delegatingWriter);
            merger.process();
        }
    }

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            try {
                writer.open();
                SortingRecordWriter recordWriter = new SortingRecordWriter();
                InputSplit split = null;
                int blockId = 0;
                while ((split = isp.next()) != null) {
                    try {
                        // One reader per split; initialize under the operator's class loader.
                        RecordReader<K1, V1> recordReader = inputFormat.createRecordReader(split,
                                taskAttemptContext);
                        ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
                        try {
                            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
                            recordReader.initialize(split, taskAttemptContext);
                        } finally {
                            Thread.currentThread().setContextClassLoader(ctxCL);
                        }
                        recordWriter.initBlock(blockId);
                        Mapper<K1, V1, K2, V2>.Context mCtx = new MRContextUtil().createMapContext(conf, taId,
                                recordReader, recordWriter, null, null, split);
                        mapper.run(mCtx);
                        recordReader.close();
                        recordWriter.sortAndFlushBlock(writer);
                        ++blockId;
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    } catch (InterruptedException e) {
                        throw new HyracksDataException(e);
                    }
                }
            } catch (Throwable th) {
                writer.fail();
                throw th;
            } finally {
                writer.close();
            }
        }
    };
}
From source file: org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License: Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}