Example usage for org.apache.hadoop.mapreduce RecordReader initialize

List of usage examples for org.apache.hadoop.mapreduce RecordReader initialize

Introduction

On this page you can find usage examples for the initialize method of org.apache.hadoop.mapreduce.RecordReader.

Prototype

public abstract void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException;

Document

Called once at initialization.
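
The method is abstract, so every concrete reader supplies its own setup logic. The sketch below is a minimal, hypothetical implementation (not taken from any of the sources listed here) showing the typical work done in initialize: casting the split, opening the underlying file, and seeking to the split's start offset.

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

// Hypothetical reader that treats every byte of the split as one record.
public class SingleByteRecordReader extends RecordReader<LongWritable, Text> {
    private FSDataInputStream in;
    private long start, end, pos;
    private final LongWritable key = new LongWritable();
    private final Text value = new Text();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // Called once before the first nextKeyValue(): open the file backing
        // the split and position the stream at the split's start offset.
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        in = fs.open(path);
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        in.seek(start);
        pos = start;
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        if (pos >= end) {
            return false;
        }
        int b = in.read();
        if (b < 0) {
            return false;
        }
        key.set(pos);
        value.set(new byte[] { (byte) b }, 0, 1);
        pos++;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() { return key; }

    @Override
    public Text getCurrentValue() { return value; }

    @Override
    public float getProgress() {
        return end == start ? 1.0f : (pos - start) / (float) (end - start);
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}

Inside a regular MapReduce job, the framework calls initialize() on the reader returned by InputFormat.createRecordReader() before the mapper runs, so implementations should do their setup there rather than in the constructor. Several of the examples below call initialize() eagerly inside createRecordReader(), or drive readers directly from tests and embedded engines, which is when the explicit call matters.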

Usage

From source file: com.toshiba.mwcloud.gs.hadoop.mapreduce.GSRowInputFormat.java

License: Apache License

@Override
public RecordReader<GSColumnKeyWritable, GSRowWritable> createRecordReader(InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<GSColumnKeyWritable, GSRowWritable> reader = new GSRowRecordReader();
    reader.initialize(split, context);
    return reader;
}
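
When a RecordReader is driven outside a running job, nothing calls initialize() on your behalf. The following standalone sketch assumes a TextInputFormat over a path passed on the command line; the class name, the path argument, and the use of TaskAttemptContextImpl with a blank TaskAttemptID are illustrative assumptions, not taken from the source above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class StandaloneReadExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.addInputPath(job, new Path(args[0]));

        TextInputFormat inputFormat = new TextInputFormat();
        for (InputSplit split : inputFormat.getSplits(job)) {
            TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            // Outside a job there is no framework to call initialize(), so do it here.
            reader.initialize(split, context);
            try {
                while (reader.nextKeyValue()) {
                    System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
                }
            } finally {
                reader.close();
            }
        }
    }
}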

From source file: edu.uci.ics.hyracks.dataflow.hadoop.mapreduce.MapperOperatorDescriptor.java

License: Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final HadoopHelper helper = new HadoopHelper(config);
    final Configuration conf = helper.getConfiguration();
    final Mapper<K1, V1, K2, V2> mapper = helper.getMapper();
    final InputFormat<K1, V1> inputFormat = helper.getInputFormat();
    final IInputSplitProvider isp = factory.createInputSplitProvider(partition);
    final TaskAttemptID taId = new TaskAttemptID("foo", jobId, true, partition, 0);
    final TaskAttemptContext taskAttemptContext = helper.createTaskAttemptContext(taId);

    final int framesLimit = helper.getSortFrameLimit(ctx);
    final IBinaryComparatorFactory[] comparatorFactories = helper.getSortComparatorFactories();

    class SortingRecordWriter extends RecordWriter<K2, V2> {
        private final ArrayTupleBuilder tb;
        private final ByteBuffer frame;
        private final FrameTupleAppender fta;
        private ExternalSortRunGenerator runGen;
        private int blockId;

        public SortingRecordWriter() throws HyracksDataException {
            tb = new ArrayTupleBuilder(2);
            frame = ctx.allocateFrame();
            fta = new FrameTupleAppender(ctx.getFrameSize());
            fta.reset(frame, true);
        }

        public void initBlock(int blockId) throws HyracksDataException {
            runGen = new ExternalSortRunGenerator(ctx, new int[] { 0 }, null, comparatorFactories,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), Algorithm.MERGE_SORT, framesLimit);
            this.blockId = blockId;
        }

        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        @Override
        public void write(K2 key, V2 value) throws IOException, InterruptedException {
            DataOutput dos = tb.getDataOutput();
            tb.reset();
            key.write(dos);
            tb.addFieldEndOffset();
            value.write(dos);
            tb.addFieldEndOffset();
            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                runGen.nextFrame(frame);
                fta.reset(frame, true);
                if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("Record size (" + tb.getSize() + ") larger than frame size ("
                            + frame.capacity() + ")");
                }
            }
        }

        public void sortAndFlushBlock(final IFrameWriter writer) throws HyracksDataException {
            if (fta.getTupleCount() > 0) {
                runGen.nextFrame(frame);
                fta.reset(frame, true);
            }
            runGen.close();
            IFrameWriter delegatingWriter = new IFrameWriter() {
                private final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
                private final ByteBuffer outFrame = ctx.allocateFrame();
                private final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields());
                private final ArrayTupleBuilder tb = new ArrayTupleBuilder(3);

                @Override
                public void open() throws HyracksDataException {
                    appender.reset(outFrame, true);
                }

                @Override
                public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
                    fta.reset(buffer);
                    int n = fta.getTupleCount();
                    for (int i = 0; i < n; ++i) {
                        tb.reset();
                        tb.addField(fta, i, 0);
                        tb.addField(fta, i, 1);
                        try {
                            tb.getDataOutput().writeInt(blockId);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                        tb.addFieldEndOffset();
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(outFrame, writer);
                            appender.reset(outFrame, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }
                }

                @Override
                public void close() throws HyracksDataException {
                    if (appender.getTupleCount() > 0) {
                        FrameUtils.flushFrame(outFrame, writer);
                    }
                }

                @Override
                public void fail() throws HyracksDataException {
                    // TODO Auto-generated method stub

                }
            };
            if (helper.hasCombiner()) {
                Reducer<K2, V2, K2, V2> combiner = helper.getCombiner();
                TaskAttemptID ctaId = new TaskAttemptID("foo", jobId, true, partition, 0);
                TaskAttemptContext ctaskAttemptContext = helper.createTaskAttemptContext(taId);
                final IFrameWriter outputWriter = delegatingWriter;
                RecordWriter<K2, V2> recordWriter = new RecordWriter<K2, V2>() {
                    private final FrameTupleAppender fta = new FrameTupleAppender(ctx.getFrameSize());
                    private final ByteBuffer buffer = ctx.allocateFrame();
                    private final ArrayTupleBuilder tb = new ArrayTupleBuilder(2);

                    {
                        fta.reset(buffer, true);
                        outputWriter.open();
                    }

                    @Override
                    public void write(K2 key, V2 value) throws IOException, InterruptedException {
                        DataOutput dos = tb.getDataOutput();
                        tb.reset();
                        key.write(dos);
                        tb.addFieldEndOffset();
                        value.write(dos);
                        tb.addFieldEndOffset();
                        if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(buffer, outputWriter);
                            fta.reset(buffer, true);
                            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }

                    @Override
                    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                        if (fta.getTupleCount() > 0) {
                            FrameUtils.flushFrame(buffer, outputWriter);
                            outputWriter.close();
                        }
                    }
                };
                delegatingWriter = new ReduceWriter<K2, V2, K2, V2>(ctx, helper,
                        new int[] { HadoopHelper.KEY_FIELD_INDEX }, helper.getGroupingComparatorFactories(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields(), combiner, recordWriter, ctaId,
                        ctaskAttemptContext);
            }
            IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
            for (int i = 0; i < comparatorFactories.length; ++i) {
                comparators[i] = comparatorFactories[i].createBinaryComparator();
            }
            ExternalSortRunMerger merger = new ExternalSortRunMerger(ctx, runGen.getFrameSorter(),
                    runGen.getRuns(), new int[] { 0 }, comparators, null,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), framesLimit, delegatingWriter);
            merger.process();
        }
    }

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @Override
        public void initialize() throws HyracksDataException {
            writer.open();
            try {
                SortingRecordWriter recordWriter = new SortingRecordWriter();
                InputSplit split = null;
                int blockId = 0;
                while ((split = isp.next()) != null) {
                    try {
                        RecordReader<K1, V1> recordReader = inputFormat.createRecordReader(split,
                                taskAttemptContext);
                        ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
                        try {
                            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
                            recordReader.initialize(split, taskAttemptContext);
                        } finally {
                            Thread.currentThread().setContextClassLoader(ctxCL);
                        }
                        recordWriter.initBlock(blockId);
                        Mapper<K1, V1, K2, V2>.Context mCtx = new MRContextUtil().createMapContext(conf, taId,
                                recordReader, recordWriter, null, null, split);
                        mapper.run(mCtx);
                        recordReader.close();
                        recordWriter.sortAndFlushBlock(writer);
                        ++blockId;
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    } catch (InterruptedException e) {
                        throw new HyracksDataException(e);
                    }
                }
            } finally {
                writer.close();
            }
        }
    };
}
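
The snippet above swaps the thread's context ClassLoader around the initialize() call because some input formats resolve classes through it. A small hypothetical helper like the one below (not part of the Hyracks source) captures that swap-and-restore pattern for reuse.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public final class RecordReaderInit {
    private RecordReaderInit() {
    }

    /** Initializes the reader with the given ClassLoader installed as the thread's context ClassLoader. */
    public static <K, V> void initializeWith(RecordReader<K, V> reader, InputSplit split,
            TaskAttemptContext context, ClassLoader loader) throws IOException, InterruptedException {
        ClassLoader previous = Thread.currentThread().getContextClassLoader();
        try {
            Thread.currentThread().setContextClassLoader(loader);
            reader.initialize(split, context);
        } finally {
            // Always restore the previous ClassLoader, even if initialize() throws.
            Thread.currentThread().setContextClassLoader(previous);
        }
    }
}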

From source file: edu.uci.ics.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java

License: Apache License

@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * read all the partitions scheduled to the current node
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }

                        /**
                         * read the split
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}

From source file: eu.scape_project.tb.wc.archd.test.ARCTest.java

License: Apache License

/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);
    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();

        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (first one and the header of the ARC file)
        if (start == 1) {
            //"myContentString" is arbitrary sting snipped of which we know that it exists in the content stream and of which we know the position in the stream.
            //We will search for the string int the content we read and compare it to the values we know.                
            currContent = content2String(currStream);
            myContentString = "defaultgz_orderxml";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "20130522085320/filedesc://3-2-20130522085320-00000-prepc2.arc",
                    currKey.toString());
            assertEquals("MIME Type not equal", "text/plain", currMIMEType);
            assertEquals("Response type not equal", "response", currType);
            assertEquals("URL not equal", "filedesc://3-2-20130522085320-00000-prepc2.arc", currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 08:53:20"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 1190, currLength);
            assertEquals("Content seems not to be correct", 531, myContentStringIndex);
        }
        start++;
    }
}

From source file: eu.scape_project.tb.wc.archd.test.WARCTest.java

License: Apache License

/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);
    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();

        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (first one and the header of the WARC file)
        if (start == 1) {
            //"myContentString" is arbitrary sting snipped of which we know that it exists in the content stream and of which we know the position in the stream.
            //We will search for the string int the content we read and compare it to the values we know.                
            currContent = content2String(currStream);
            myContentString = "isPartOf: basic";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "<urn:uuid:18cfb53d-1c89-4cc6-863f-e5535d430c95>", currKey.toString());
            assertEquals("MIME Type not equal", "application/warc-fields", currMIMEType);
            assertEquals("Response type not equal", "warcinfo", currType);
            assertEquals("URL not equal", null, currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 12:27:40"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 374, currLength);
            assertEquals("Content mismatch", 202, myContentStringIndex);
        }
        start++;
    }
}

From source file: fi.tkk.ics.hadoop.bam.BAMInputFormat.java

License: Open Source License

/** Returns a {@link BAMRecordReader} initialized with the parameters. */
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    final RecordReader<LongWritable, SAMRecordWritable> rr = new BAMRecordReader();
    rr.initialize(split, ctx);
    return rr;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java

License: Open Source License

@Override
public RecordReader<LongWritable, Range> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    initBaseIF(ContextUtil.getConfiguration(ctx));

    final RecordReader<LongWritable, Range> rr = new SummarizeRecordReader(
            baseIF.createRecordReader(split, ctx));
    rr.initialize(split, ctx);
    return rr;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java

License: Open Source License

@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    final RecordReader<LongWritable, Text> rr = new SortRecordReader();
    rr.initialize(split, ctx);
    return rr;
}

From source file: fi.tkk.ics.hadoop.bam.cli.plugins.Sort.java

License: Open Source License

@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    initBaseIF(ContextUtil.getConfiguration(ctx));

    final RecordReader<LongWritable, SAMRecordWritable> rr = new SortRecordReader(
            baseIF.createRecordReader(split, ctx));
    rr.initialize(split, ctx);
    return rr;
}

From source file: fi.tkk.ics.hadoop.bam.SAMInputFormat.java

License: Open Source License

/** Returns a {@link SAMRecordReader} initialized with the parameters. */
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split,
        TaskAttemptContext ctx) throws InterruptedException, IOException {
    final RecordReader<LongWritable, SAMRecordWritable> rr = new SAMRecordReader();
    rr.initialize(split, ctx);
    return rr;
}