List of usage examples for org.apache.hadoop.mapreduce.RecordReader.initialize
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
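All of the examples below follow the same pattern: an InputFormat's createRecordReader builds a reader and calls initialize(split, context) before handing it back (the MapReduce framework also calls initialize before the first nextKeyValue, so this eager initialization must be safe to repeat). For orientation, here is a minimal, illustrative sketch of what a concrete initialize implementation typically does; the class SimpleLineRecordReader and its behavior are assumptions for illustration only and do not come from any of the source files listed below. Unlike Hadoop's real LineRecordReader, it does not handle records that straddle split boundaries or compressed input.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;

/** Illustrative sketch only: a RecordReader that reads whole lines from a FileSplit. */
public class SimpleLineRecordReader extends RecordReader<LongWritable, Text> {
    private LineReader in;
    private long start, pos, end;
    private final LongWritable key = new LongWritable();
    private final Text value = new Text();

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        // initialize() receives the split and the task context; this is where
        // file handles, codecs, and other per-split state are set up.
        FileSplit fileSplit = (FileSplit) split;
        Configuration conf = context.getConfiguration();
        Path path = fileSplit.getPath();
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream stream = fs.open(path);
        start = fileSplit.getStart();
        end = start + fileSplit.getLength();
        stream.seek(start);
        in = new LineReader(stream, conf);
        pos = start;
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        if (pos >= end) {
            return false;
        }
        key.set(pos);
        int bytesRead = in.readLine(value);
        if (bytesRead == 0) {
            return false;
        }
        pos += bytesRead;
        return true;
    }

    @Override
    public LongWritable getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    @Override
    public float getProgress() {
        return end == start ? 1.0f : (pos - start) / (float) (end - start);
    }

    @Override
    public void close() throws IOException {
        if (in != null) {
            in.close();
        }
    }
}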
From source file:com.toshiba.mwcloud.gs.hadoop.mapreduce.GSRowInputFormat.java
License:Apache License
@Override
public RecordReader<GSColumnKeyWritable, GSRowWritable> createRecordReader(InputSplit split,
        TaskAttemptContext context) throws IOException, InterruptedException {
    RecordReader<GSColumnKeyWritable, GSRowWritable> reader = new GSRowRecordReader();
    reader.initialize(split, context);
    return reader;
}
From source file:edu.uci.ics.hyracks.dataflow.hadoop.mapreduce.MapperOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final HadoopHelper helper = new HadoopHelper(config);
    final Configuration conf = helper.getConfiguration();
    final Mapper<K1, V1, K2, V2> mapper = helper.getMapper();
    final InputFormat<K1, V1> inputFormat = helper.getInputFormat();
    final IInputSplitProvider isp = factory.createInputSplitProvider(partition);
    final TaskAttemptID taId = new TaskAttemptID("foo", jobId, true, partition, 0);
    final TaskAttemptContext taskAttemptContext = helper.createTaskAttemptContext(taId);

    final int framesLimit = helper.getSortFrameLimit(ctx);
    final IBinaryComparatorFactory[] comparatorFactories = helper.getSortComparatorFactories();

    class SortingRecordWriter extends RecordWriter<K2, V2> {
        private final ArrayTupleBuilder tb;
        private final ByteBuffer frame;
        private final FrameTupleAppender fta;
        private ExternalSortRunGenerator runGen;
        private int blockId;

        public SortingRecordWriter() throws HyracksDataException {
            tb = new ArrayTupleBuilder(2);
            frame = ctx.allocateFrame();
            fta = new FrameTupleAppender(ctx.getFrameSize());
            fta.reset(frame, true);
        }

        public void initBlock(int blockId) throws HyracksDataException {
            runGen = new ExternalSortRunGenerator(ctx, new int[] { 0 }, null, comparatorFactories,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), Algorithm.MERGE_SORT, framesLimit);
            this.blockId = blockId;
        }

        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        @Override
        public void write(K2 key, V2 value) throws IOException, InterruptedException {
            DataOutput dos = tb.getDataOutput();
            tb.reset();
            key.write(dos);
            tb.addFieldEndOffset();
            value.write(dos);
            tb.addFieldEndOffset();
            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                runGen.nextFrame(frame);
                fta.reset(frame, true);
                if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("Record size (" + tb.getSize() + ") larger than frame size ("
                            + frame.capacity() + ")");
                }
            }
        }

        public void sortAndFlushBlock(final IFrameWriter writer) throws HyracksDataException {
            if (fta.getTupleCount() > 0) {
                runGen.nextFrame(frame);
                fta.reset(frame, true);
            }
            runGen.close();
            IFrameWriter delegatingWriter = new IFrameWriter() {
                private final FrameTupleAppender appender = new FrameTupleAppender(ctx.getFrameSize());
                private final ByteBuffer outFrame = ctx.allocateFrame();
                private final FrameTupleAccessor fta = new FrameTupleAccessor(ctx.getFrameSize(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields());
                private final ArrayTupleBuilder tb = new ArrayTupleBuilder(3);

                @Override
                public void open() throws HyracksDataException {
                    appender.reset(outFrame, true);
                }

                @Override
                public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
                    fta.reset(buffer);
                    int n = fta.getTupleCount();
                    for (int i = 0; i < n; ++i) {
                        tb.reset();
                        tb.addField(fta, i, 0);
                        tb.addField(fta, i, 1);
                        try {
                            tb.getDataOutput().writeInt(blockId);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                        tb.addFieldEndOffset();
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(outFrame, writer);
                            appender.reset(outFrame, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }
                }

                @Override
                public void close() throws HyracksDataException {
                    if (appender.getTupleCount() > 0) {
                        FrameUtils.flushFrame(outFrame, writer);
                    }
                }

                @Override
                public void fail() throws HyracksDataException {
                    // TODO Auto-generated method stub
                }
            };
            if (helper.hasCombiner()) {
                Reducer<K2, V2, K2, V2> combiner = helper.getCombiner();
                TaskAttemptID ctaId = new TaskAttemptID("foo", jobId, true, partition, 0);
                TaskAttemptContext ctaskAttemptContext = helper.createTaskAttemptContext(taId);
                final IFrameWriter outputWriter = delegatingWriter;
                RecordWriter<K2, V2> recordWriter = new RecordWriter<K2, V2>() {
                    private final FrameTupleAppender fta = new FrameTupleAppender(ctx.getFrameSize());
                    private final ByteBuffer buffer = ctx.allocateFrame();
                    private final ArrayTupleBuilder tb = new ArrayTupleBuilder(2);

                    {
                        fta.reset(buffer, true);
                        outputWriter.open();
                    }

                    @Override
                    public void write(K2 key, V2 value) throws IOException, InterruptedException {
                        DataOutput dos = tb.getDataOutput();
                        tb.reset();
                        key.write(dos);
                        tb.addFieldEndOffset();
                        value.write(dos);
                        tb.addFieldEndOffset();
                        if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            FrameUtils.flushFrame(buffer, outputWriter);
                            fta.reset(buffer, true);
                            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }

                    @Override
                    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                        if (fta.getTupleCount() > 0) {
                            FrameUtils.flushFrame(buffer, outputWriter);
                            outputWriter.close();
                        }
                    }
                };
                delegatingWriter = new ReduceWriter<K2, V2, K2, V2>(ctx, helper,
                        new int[] { HadoopHelper.KEY_FIELD_INDEX }, helper.getGroupingComparatorFactories(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields(), combiner, recordWriter, ctaId,
                        ctaskAttemptContext);
            }
            IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
            for (int i = 0; i < comparatorFactories.length; ++i) {
                comparators[i] = comparatorFactories[i].createBinaryComparator();
            }
            ExternalSortRunMerger merger = new ExternalSortRunMerger(ctx, runGen.getFrameSorter(),
                    runGen.getRuns(), new int[] { 0 }, comparators, null,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), framesLimit, delegatingWriter);
            merger.process();
        }
    }

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @Override
        public void initialize() throws HyracksDataException {
            writer.open();
            try {
                SortingRecordWriter recordWriter = new SortingRecordWriter();
                InputSplit split = null;
                int blockId = 0;
                while ((split = isp.next()) != null) {
                    try {
                        RecordReader<K1, V1> recordReader = inputFormat.createRecordReader(split,
                                taskAttemptContext);
                        ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
                        try {
                            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
                            recordReader.initialize(split, taskAttemptContext);
                        } finally {
                            Thread.currentThread().setContextClassLoader(ctxCL);
                        }
                        recordWriter.initBlock(blockId);
                        Mapper<K1, V1, K2, V2>.Context mCtx = new MRContextUtil().createMapContext(conf, taId,
                                recordReader, recordWriter, null, null, split);
                        mapper.run(mCtx);
                        recordReader.close();
                        recordWriter.sortAndFlushBlock(writer);
                        ++blockId;
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    } catch (InterruptedException e) {
                        throw new HyracksDataException(e);
                    }
                }
            } finally {
                writer.close();
            }
        }
    };
}
From source file:edu.uci.ics.hyracks.hdfs2.dataflow.HDFSReadOperatorDescriptor.java
License:Apache License
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final List<FileSplit> inputSplits = splitsFactory.getSplits();

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        private String nodeName = ctx.getJobletContext().getApplicationContext().getNodeId();
        private ContextFactory ctxFactory = new ContextFactory();

        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
            try {
                Thread.currentThread().setContextClassLoader(ctx.getJobletContext().getClassLoader());
                Job job = confFactory.getConf();
                job.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                IKeyValueParser parser = tupleParserFactory.createKeyValueParser(ctx);
                writer.open();
                InputFormat inputFormat = ReflectionUtils.newInstance(job.getInputFormatClass(),
                        job.getConfiguration());
                int size = inputSplits.size();
                for (int i = 0; i < size; i++) {
                    /**
                     * Read all the partitions scheduled to the current node.
                     */
                    if (scheduledLocations[i].equals(nodeName)) {
                        /**
                         * Pick an unread split to read; synchronize among
                         * simultaneous partitions on the same machine.
                         */
                        synchronized (executed) {
                            if (executed[i] == false) {
                                executed[i] = true;
                            } else {
                                continue;
                            }
                        }
                        /**
                         * Read the split.
                         */
                        TaskAttemptContext context = ctxFactory.createContext(job.getConfiguration(), i);
                        context.getConfiguration().setClassLoader(ctx.getJobletContext().getClassLoader());
                        RecordReader reader = inputFormat.createRecordReader(inputSplits.get(i), context);
                        reader.initialize(inputSplits.get(i), context);
                        while (reader.nextKeyValue() == true) {
                            parser.parse(reader.getCurrentKey(), reader.getCurrentValue(), writer,
                                    inputSplits.get(i).toString());
                        }
                    }
                }
                parser.close(writer);
                writer.close();
            } catch (Exception e) {
                throw new HyracksDataException(e);
            } finally {
                Thread.currentThread().setContextClassLoader(ctxCL);
            }
        }
    };
}
From source file:eu.scape_project.tb.wc.archd.test.ARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: " + currType
                + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc + " Length: "
                + currLength);

        // Check example record 1 (the first record, which is the header of the ARC file).
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet that we know exists in the content stream
            // and whose position in the stream we know. We search for it in the content we read and
            // compare the index to the known value.
            currContent = content2String(currStream);
            myContentString = "defaultgz_orderxml";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);
            assertEquals("ID not equal", "20130522085320/filedesc://3-2-20130522085320-00000-prepc2.arc",
                    currKey.toString());
            assertEquals("MIME Type not equal", "text/plain", currMIMEType);
            assertEquals("Response type not equal", "response", currType);
            assertEquals("URL not equal", "filedesc://3-2-20130522085320-00000-prepc2.arc", currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 08:53:20"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 1190, currLength);
            assertEquals("Content seems not to be correct", 531, myContentStringIndex);
        }
        start++;
    }
}
From source file:eu.scape_project.tb.wc.archd.test.WARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: " + currType
                + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc + " Length: "
                + currLength);

        // Check example record 1 (the first record, which is the header of the WARC file).
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet that we know exists in the content stream
            // and whose position in the stream we know. We search for it in the content we read and
            // compare the index to the known value.
            currContent = content2String(currStream);
            myContentString = "isPartOf: basic";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + "=> Index is: " + myContentStringIndex);
            assertEquals("ID not equal", "<urn:uuid:18cfb53d-1c89-4cc6-863f-e5535d430c95>", currKey.toString());
            assertEquals("MIME Type not equal", "application/warc-fields", currMIMEType);
            assertEquals("Response type not equal", "warcinfo", currType);
            assertEquals("URL not equal", null, currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 12:27:40"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 374, currLength);
            assertEquals("Content mismatch", 202, myContentStringIndex);
        }
        start++;
    }
}
From source file:fi.tkk.ics.hadoop.bam.BAMInputFormat.java
License:Open Source License
/** Returns a {@link BAMRecordReader} initialized with the parameters. */
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    final RecordReader<LongWritable, SAMRecordWritable> rr = new BAMRecordReader();
    rr.initialize(split, ctx);
    return rr;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.Summarize.java
License:Open Source License
@Override
public RecordReader<LongWritable, Range> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    initBaseIF(ContextUtil.getConfiguration(ctx));
    final RecordReader<LongWritable, Range> rr = new SummarizeRecordReader(
            baseIF.createRecordReader(split, ctx));
    rr.initialize(split, ctx);
    return rr;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.chipster.SummarySort.java
License:Open Source License
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    final RecordReader<LongWritable, Text> rr = new SortRecordReader();
    rr.initialize(split, ctx);
    return rr;
}
From source file:fi.tkk.ics.hadoop.bam.cli.plugins.Sort.java
License:Open Source License
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    initBaseIF(ContextUtil.getConfiguration(ctx));
    final RecordReader<LongWritable, SAMRecordWritable> rr = new SortRecordReader(
            baseIF.createRecordReader(split, ctx));
    rr.initialize(split, ctx);
    return rr;
}
From source file:fi.tkk.ics.hadoop.bam.SAMInputFormat.java
License:Open Source License
/** Returns a {@link SAMRecordReader} initialized with the parameters. */
@Override
public RecordReader<LongWritable, SAMRecordWritable> createRecordReader(InputSplit split, TaskAttemptContext ctx)
        throws InterruptedException, IOException {
    final RecordReader<LongWritable, SAMRecordWritable> rr = new SAMRecordReader();
    rr.initialize(split, ctx);
    return rr;
}