Example usage for org.apache.hadoop.mapreduce InputFormat createRecordReader

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.InputFormat.createRecordReader, collected from open-source projects.

Prototype

public abstract RecordReader<K, V> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Document

Create a record reader for a given split.
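
The project-specific examples below all follow the same lifecycle: create a reader for a split, initialize it with that split and a TaskAttemptContext, iterate with nextKeyValue()/getCurrentValue(), and close it. As a minimal, self-contained sketch of that lifecycle (not taken from any of the projects below; it assumes Hadoop's built-in TextInputFormat and a placeholder input path /tmp/sample.txt):

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class CreateRecordReaderSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path("/tmp/sample.txt")); // placeholder input

        // TextInputFormat yields <LongWritable, Text> pairs: byte offset and line text.
        InputFormat<LongWritable, Text> format = new TextInputFormat();
        TaskAttemptContext ctx = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        // One reader per split: create, initialize, iterate, close.
        List<InputSplit> splits = format.getSplits(job);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.createRecordReader(split, ctx);
            try {
                reader.initialize(split, ctx);
                while (reader.nextKeyValue()) {
                    System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
                }
            } finally {
                reader.close();
            }
        }
    }
}

The usage examples that follow apply this same pattern, either behind an Iterator or inside a test.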

Usage

From source file:org.apache.crunch.io.hcatalog.HCatRecordDataIterable.java

License:Apache License

@Override
public Iterator<HCatRecord> iterator() {
    try {
        Job job = Job.getInstance(bundle.configure(conf));

        final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
        final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        return Iterators
                .concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<HCatRecord>>() {

                    @Override
                    public Iterator<HCatRecord> apply(InputSplit split) {
                        RecordReader reader = null;
                        try {
                            reader = fmt.createRecordReader(split, ctxt);
                            reader.initialize(split, ctxt);
                        } catch (IOException | InterruptedException e) {
                            throw new CrunchRuntimeException(e);
                        }
                        return new HCatRecordReaderIterator(reader);
                    }
                }).iterator());
    } catch (Exception e) {
        throw new CrunchRuntimeException(e);
    }
}

From source file:org.apache.crunch.io.impl.DefaultFileReaderFactory.java

License:Apache License

@Override
public Iterator<T> read(FileSystem fs, Path path) {
    final Configuration conf = new Configuration(fs.getConf());
    bundle.configure(conf);
    ptype.initialize(conf);

    final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
    final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    try {
        Job job = new Job(conf);
        FileInputFormat.addInputPath(job, path);
        return Iterators.concat(Lists.transform(fmt.getSplits(job), new Function<InputSplit, Iterator<T>>() {
            @Override
            public Iterator<T> apply(InputSplit split) {
                try {
                    RecordReader reader = fmt.createRecordReader(split, ctxt);
                    reader.initialize(split, ctxt);
                    return new RecordReaderIterator<T>(reader, ptype);
                } catch (Exception e) {
                    LOG.error("Error reading split: " + split, e);
                    throw new CrunchRuntimeException(e);
                }
            }
        }).iterator());
    } catch (Exception e) {
        LOG.error("Error reading path: " + path, e);
        throw new CrunchRuntimeException(e);
    }
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:org.apache.druid.data.input.orc.OrcHadoopInputRowParserTest.java

License:Apache License

private static OrcStruct getFirstRow(Job job, String orcPath) throws IOException, InterruptedException {
    File testFile = new File(orcPath);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {

        reader.initialize(split, context);
        reader.nextKeyValue();
        return (OrcStruct) reader.getCurrentValue();
    }
}

From source file:org.apache.druid.data.input.orc.OrcHadoopInputRowParserTest.java

License:Apache License

private static List<InputRow> getAllRows(HadoopDruidIndexerConfig config)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);

    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();

        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }

        return records;
    }
}

From source file:org.apache.druid.data.input.parquet.BaseParquetInputTest.java

License:Apache License

static Object getFirstRow(Job job, String parserType, String parquetPath)
        throws IOException, InterruptedException {
    File testFile = new File(parquetPath);
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass.get(parserType),
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {

        reader.initialize(split, context);
        reader.nextKeyValue();
        return reader.getCurrentValue();
    }
}

From source file:org.apache.druid.data.input.parquet.BaseParquetInputTest.java

License:Apache License

static List<InputRow> getAllRows(String parserType, HadoopDruidIndexerConfig config)
        throws IOException, InterruptedException {
    Job job = Job.getInstance(new Configuration());
    config.intoConfiguration(job);

    File testFile = new File(((StaticPathSpec) config.getPathSpec()).getPaths());
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass.get(parserType),
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

    try (RecordReader reader = inputFormat.createRecordReader(split, context)) {
        List<InputRow> records = new ArrayList<>();
        InputRowParser parser = config.getParser();

        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            Object data = reader.getCurrentValue();
            records.add(((List<InputRow>) parser.parseBatch(data)).get(0));
        }

        return records;
    }
}

From source file:org.apache.hyracks.dataflow.hadoop.mapreduce.MapperOperatorDescriptor.java

License:Apache License

@SuppressWarnings("deprecation")
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final HadoopHelper helper = new HadoopHelper(config);
    final Configuration conf = helper.getConfiguration();
    final Mapper<K1, V1, K2, V2> mapper = helper.getMapper();
    final InputFormat<K1, V1> inputFormat = helper.getInputFormat();
    final IInputSplitProvider isp = factory.createInputSplitProvider(partition);
    final TaskAttemptID taId = new TaskAttemptID("foo", jobId, true, partition, 0);
    final TaskAttemptContext taskAttemptContext = helper.createTaskAttemptContext(taId);

    final int framesLimit = helper.getSortFrameLimit(ctx);
    final IBinaryComparatorFactory[] comparatorFactories = helper.getSortComparatorFactories();

    class SortingRecordWriter extends RecordWriter<K2, V2> {
        private final ArrayTupleBuilder tb;
        private final IFrame frame;
        private final FrameTupleAppender fta;
        private ExternalSortRunGenerator runGen;
        private int blockId;

        public SortingRecordWriter() throws HyracksDataException {
            tb = new ArrayTupleBuilder(2);
            frame = new VSizeFrame(ctx);
            fta = new FrameTupleAppender(frame);
        }

        public void initBlock(int blockId) throws HyracksDataException {
            runGen = new ExternalSortRunGenerator(ctx, new int[] { 0 }, null, comparatorFactories,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), Algorithm.MERGE_SORT, framesLimit);
            this.blockId = blockId;
        }

        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        @Override
        public void write(K2 key, V2 value) throws IOException, InterruptedException {
            DataOutput dos = tb.getDataOutput();
            tb.reset();
            key.write(dos);
            tb.addFieldEndOffset();
            value.write(dos);
            tb.addFieldEndOffset();
            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
                if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("Record size (" + tb.getSize() + ") larger than frame size ("
                            + frame.getBuffer().capacity() + ")");
                }
            }
        }

        public void sortAndFlushBlock(final IFrameWriter writer) throws HyracksDataException {
            if (fta.getTupleCount() > 0) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
            }
            runGen.close();
            IFrameWriter delegatingWriter = new IFrameWriter() {
                private final FrameTupleAppender appender = new FrameTupleAppender(new VSizeFrame(ctx));
                private final FrameTupleAccessor fta = new FrameTupleAccessor(
                        helper.getMapOutputRecordDescriptorWithoutExtraFields());
                private final ArrayTupleBuilder tb = new ArrayTupleBuilder(3);

                @Override
                public void open() throws HyracksDataException {
                }

                @Override
                public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
                    fta.reset(buffer);
                    int n = fta.getTupleCount();
                    for (int i = 0; i < n; ++i) {
                        tb.reset();
                        tb.addField(fta, i, 0);
                        tb.addField(fta, i, 1);
                        try {
                            tb.getDataOutput().writeInt(blockId);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                        tb.addFieldEndOffset();
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            appender.flush(writer, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }
                }

                @Override
                public void close() throws HyracksDataException {
                    appender.flush(writer, true);
                }

                @Override
                public void fail() throws HyracksDataException {
                    // TODO Auto-generated method stub

                }
            };
            if (helper.hasCombiner()) {
                Reducer<K2, V2, K2, V2> combiner = helper.getCombiner();
                TaskAttemptID ctaId = new TaskAttemptID("foo", jobId, true, partition, 0);
                TaskAttemptContext ctaskAttemptContext = helper.createTaskAttemptContext(taId);
                final IFrameWriter outputWriter = delegatingWriter;
                RecordWriter<K2, V2> recordWriter = new RecordWriter<K2, V2>() {
                    private final FrameTupleAppender fta = new FrameTupleAppender(new VSizeFrame(ctx));
                    private final ArrayTupleBuilder tb = new ArrayTupleBuilder(2);

                    {
                        outputWriter.open();
                    }

                    @Override
                    public void write(K2 key, V2 value) throws IOException, InterruptedException {
                        DataOutput dos = tb.getDataOutput();
                        tb.reset();
                        key.write(dos);
                        tb.addFieldEndOffset();
                        value.write(dos);
                        tb.addFieldEndOffset();
                        if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            fta.flush(outputWriter, true);
                            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }

                    @Override
                    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                        fta.flush(outputWriter, true);
                    }
                };
                delegatingWriter = new ReduceWriter<K2, V2, K2, V2>(ctx, helper,
                        new int[] { HadoopHelper.KEY_FIELD_INDEX }, helper.getGroupingComparatorFactories(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields(), combiner, recordWriter, ctaId,
                        ctaskAttemptContext);
            }
            IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
            for (int i = 0; i < comparatorFactories.length; ++i) {
                comparators[i] = comparatorFactories[i].createBinaryComparator();
            }
            ExternalSortRunMerger merger = new ExternalSortRunMerger(ctx, runGen.getSorter(), runGen.getRuns(),
                    new int[] { 0 }, comparators, null, helper.getMapOutputRecordDescriptorWithoutExtraFields(),
                    framesLimit, delegatingWriter);
            merger.process();
        }
    }

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            try {
                writer.open();
                SortingRecordWriter recordWriter = new SortingRecordWriter();
                InputSplit split = null;
                int blockId = 0;
                while ((split = isp.next()) != null) {
                    try {
                        RecordReader<K1, V1> recordReader = inputFormat.createRecordReader(split,
                                taskAttemptContext);
                        ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
                        try {
                            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
                            recordReader.initialize(split, taskAttemptContext);
                        } finally {
                            Thread.currentThread().setContextClassLoader(ctxCL);
                        }
                        recordWriter.initBlock(blockId);
                        Mapper<K1, V1, K2, V2>.Context mCtx = new MRContextUtil().createMapContext(conf, taId,
                                recordReader, recordWriter, null, null, split);
                        mapper.run(mCtx);
                        recordReader.close();
                        recordWriter.sortAndFlushBlock(writer);
                        ++blockId;
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    } catch (InterruptedException e) {
                        throw new HyracksDataException(e);
                    }
                }
            } catch (Throwable th) {
                writer.fail();
                throw th;
            } finally {
                writer.close();
            }
        }
    };
}

From source file:org.apache.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java

License:Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                writer.open();
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions in the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }

                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
            } catch (Throwable th) {
                writer.fail();
                throw new HyracksDataException(th);
            } finally {
                writer.close();
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}