Example usage for org.apache.hadoop.mapreduce RecordReader close

List of usage examples for org.apache.hadoop.mapreduce RecordReader close

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce RecordReader.close().

Prototype

public abstract void close() throws IOException;

Document

Close the record reader.
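
A minimal sketch of the typical lifecycle, assuming an InputFormat, split, and task attempt context are supplied by the caller: create the reader, initialize it, iterate with nextKeyValue(), and close it in a finally block so the underlying stream is released even when reading fails (in Hadoop versions where RecordReader implements Closeable, try-with-resources works as well). RecordReaderUtil and countRecords below are illustrative names, not part of the Hadoop API.

import java.io.IOException;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

public final class RecordReaderUtil {

    private RecordReaderUtil() {
    }

    /** Counts every record in one split, always closing the reader. */
    public static <K, V> long countRecords(InputFormat<K, V> inputFormat, InputSplit split,
            TaskAttemptContext context) throws IOException, InterruptedException {
        RecordReader<K, V> reader = inputFormat.createRecordReader(split, context);
        try {
            reader.initialize(split, context);
            long count = 0;
            while (reader.nextKeyValue()) {
                count++;
            }
            return count;
        } finally {
            // Release file handles and buffers held by the reader.
            reader.close();
        }
    }
}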

Usage

From source file:org.apache.avro.mapreduce.TestAvroKeyValueRecordReader.java

License:Apache License

/**
 * Verifies that avro records can be read and progress is reported correctly.
 */
@Test
public void testReadRecords() throws IOException, InterruptedException {
    // Create the test avro file input with two records:
    //   1. <"first", 1>
    //   2. <"second", 2>
    Schema keyValueSchema = AvroKeyValue.getSchema(Schema.create(Schema.Type.STRING),
            Schema.create(Schema.Type.INT));

    AvroKeyValue<CharSequence, Integer> firstInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    firstInputRecord.setKey("first");
    firstInputRecord.setValue(1);

    AvroKeyValue<CharSequence, Integer> secondInputRecord = new AvroKeyValue<CharSequence, Integer>(
            new GenericData.Record(keyValueSchema));
    secondInputRecord.setKey("second");
    secondInputRecord.setValue(2);

    final SeekableInput avroFileInput = new SeekableFileInput(
            AvroFiles.createFile(new File(mTempDir.getRoot(), "myInputFile.avro"), keyValueSchema,
                    firstInputRecord.get(), secondInputRecord.get()));

    // Create the record reader over the avro input file.
    RecordReader<AvroKey<CharSequence>, AvroValue<Integer>> recordReader = new AvroKeyValueRecordReader<CharSequence, Integer>(
            Schema.create(Schema.Type.STRING), Schema.create(Schema.Type.INT)) {
        @Override
        protected SeekableInput createSeekableInput(Configuration conf, Path path) throws IOException {
            return avroFileInput;
        }
    };

    // Set up the job configuration.
    Configuration conf = new Configuration();

    // Create a mock input split for this record reader.
    FileSplit inputSplit = createMock(FileSplit.class);
    expect(inputSplit.getPath()).andReturn(new Path("/path/to/an/avro/file")).anyTimes();
    expect(inputSplit.getStart()).andReturn(0L).anyTimes();
    expect(inputSplit.getLength()).andReturn(avroFileInput.length()).anyTimes();

    // Create a mock task attempt context for this record reader.
    TaskAttemptContext context = createMock(TaskAttemptContext.class);
    expect(context.getConfiguration()).andReturn(conf).anyTimes();

    // Initialize the record reader.
    replay(inputSplit);
    replay(context);
    recordReader.initialize(inputSplit, context);

    assertEquals("Progress should be zero before any records are read", 0.0f, recordReader.getProgress(), 0.0f);

    // Some variables to hold the records.
    AvroKey<CharSequence> key;
    AvroValue<Integer> value;

    // Read the first record.
    assertTrue("Expected at least one record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("First record had null key", key);
    assertNotNull("First record had null value", value);

    assertEquals("first", key.datum().toString());
    assertEquals(1, value.datum().intValue());

    assertTrue("getCurrentKey() returned different keys for the same record",
            key == recordReader.getCurrentKey());
    assertTrue("getCurrentValue() returned different values for the same record",
            value == recordReader.getCurrentValue());

    // Read the second record.
    assertTrue("Expected to read a second record", recordReader.nextKeyValue());
    key = recordReader.getCurrentKey();
    value = recordReader.getCurrentValue();

    assertNotNull("Second record had null key", key);
    assertNotNull("Second record had null value", value);

    assertEquals("second", key.datum().toString());
    assertEquals(2, value.datum().intValue());

    assertEquals("Progress should be complete (2 out of 2 records processed)", 1.0f, recordReader.getProgress(),
            0.0f);

    // There should be no more records.
    assertFalse("Expected only 2 records", recordReader.nextKeyValue());

    // Close the record reader.
    recordReader.close();

    // Verify the expected calls on the mocks.
    verify(inputSplit);
    verify(context);
}

From source file:org.apache.carbondata.store.LocalCarbonStore.java

License:Apache License

@Override
public Iterator<CarbonRow> scan(AbsoluteTableIdentifier tableIdentifier, String[] projectColumns,
        Expression filter) throws IOException {
    Objects.requireNonNull(tableIdentifier);
    Objects.requireNonNull(projectColumns);

    CarbonTable table = getTable(tableIdentifier.getTablePath());
    if (table.isStreamingSink() || table.isHivePartitionTable()) {
        throw new UnsupportedOperationException("streaming and partition table is not supported");
    }
    // TODO: use InputFormat to prune data and read data

    final CarbonTableInputFormat format = new CarbonTableInputFormat();
    final Job job = new Job(new Configuration());
    CarbonInputFormat.setTableInfo(job.getConfiguration(), table.getTableInfo());
    CarbonInputFormat.setTablePath(job.getConfiguration(), table.getTablePath());
    CarbonInputFormat.setTableName(job.getConfiguration(), table.getTableName());
    CarbonInputFormat.setDatabaseName(job.getConfiguration(), table.getDatabaseName());
    CarbonInputFormat.setCarbonReadSupport(job.getConfiguration(), CarbonRowReadSupport.class);
    CarbonInputFormat.setColumnProjection(job.getConfiguration(), new CarbonProjection(projectColumns));
    if (filter != null) {
        CarbonInputFormat.setFilterPredicates(job.getConfiguration(), filter);
    }

    final List<InputSplit> splits = format.getSplits(new JobContextImpl(job.getConfiguration(), new JobID()));

    List<RecordReader<Void, Object>> readers = new ArrayList<>(splits.size());

    List<CarbonRow> rows = new ArrayList<>();

    try {
        for (InputSplit split : splits) {
            TaskAttemptContextImpl attempt = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader reader = format.createRecordReader(split, attempt);
            reader.initialize(split, attempt);
            readers.add(reader);
        }

        for (RecordReader<Void, Object> reader : readers) {
            while (reader.nextKeyValue()) {
                rows.add((CarbonRow) reader.getCurrentValue());
            }
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    } catch (InterruptedException e) {
        throw new IOException(e);
    } finally {
        for (RecordReader<Void, Object> reader : readers) {
            try {
                reader.close();
            } catch (IOException e) {
                LOGGER.error(e);
            }
        }
    }
    return rows.iterator();
}

From source file:org.apache.crunch.kafka.inputformat.KafkaInputFormatIT.java

License:Apache License

@Test
public void getSplitsCreateReaders() throws IOException, InterruptedException {
    List<String> keys = ClusterTest.writeData(ClusterTest.getProducerProperties(), topic, "batch", 10, 10);
    Map<TopicPartition, Long> startOffsets = getBrokerOffsets(consumerProps, OffsetRequest.EarliestTime(),
            topic);
    Map<TopicPartition, Long> endOffsets = getBrokerOffsets(consumerProps, OffsetRequest.LatestTime(), topic);

    Map<TopicPartition, Pair<Long, Long>> offsets = new HashMap<>();
    for (Map.Entry<TopicPartition, Long> entry : startOffsets.entrySet()) {
        Long endingOffset = endOffsets.get(entry.getKey());
        offsets.put(entry.getKey(), Pair.of(entry.getValue(), endingOffset));
    }

    KafkaInputFormat.writeOffsetsToConfiguration(offsets, config);

    KafkaInputFormat inputFormat = new KafkaInputFormat();
    inputFormat.setConf(config);
    List<InputSplit> splits = inputFormat.getSplits(null);

    assertThat(splits.size(), is(offsets.size()));

    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        Pair<Long, Long> startEnd = offsets.get(inputSplit.getTopicPartition());
        assertThat(inputSplit.getStartingOffset(), is(startEnd.first()));
        assertThat(inputSplit.getEndingOffset(), is(startEnd.second()));
    }

    //create readers and consume the data
    when(taskContext.getConfiguration()).thenReturn(config);
    Set<String> keysRead = new HashSet<>();
    //read all data from all splits
    for (InputSplit split : splits) {
        KafkaInputSplit inputSplit = (KafkaInputSplit) split;
        long start = inputSplit.getStartingOffset();
        long end = inputSplit.getEndingOffset();

        RecordReader<BytesWritable, BytesWritable> recordReader = inputFormat.createRecordReader(split,
                taskContext);
        recordReader.initialize(split, taskContext);

        int numRecordsFound = 0;
        String currentKey;
        while (recordReader.nextKeyValue()) {
            currentKey = new String(recordReader.getCurrentKey().getBytes());
            keysRead.add(currentKey);
            assertThat(keys, hasItem(currentKey));
            assertThat(recordReader.getCurrentValue(), is(notNullValue()));
            numRecordsFound++;
        }
        recordReader.close();

        //assert that it encountered a partition's worth of data
        assertThat(((long) numRecordsFound), is(end - start));
    }

    //validate the same number of unique keys was read as were written.
    assertThat(keysRead.size(), is(keys.size()));
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:org.apache.druid.data.input.orc.DruidOrcInputFormatTest.java

License:Apache License

@Test
public void testReadDateColumn() throws IOException, InterruptedException {
    File testFile2 = makeOrcFileWithDate();
    Path path = new Path(testFile2.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile2.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());

    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    InputRowParser<OrcStruct> parser = (InputRowParser<OrcStruct>) config.getParser();

    reader.initialize(split, context);

    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();

    MapBasedInputRow row = (MapBasedInputRow) parser.parseBatch(data).get(0);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(DateTimes.of(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}

From source file:org.apache.hcatalog.pig.TestE2EScenarios.java

License:Apache License

private void copyTable(String in, String out) throws IOException, InterruptedException {
    Job ijob = new Job();
    Job ojob = new Job();
    HCatInputFormat inpy = new HCatInputFormat();
    inpy.setInput(ijob, null, in);
    HCatOutputFormat oupy = new HCatOutputFormat();
    oupy.setOutput(ojob, OutputJobInfo.create(null, out, new HashMap<String, String>()));

    // Test HCatContext

    System.err.println("HCatContext INSTANCE is present : " + HCatContext.INSTANCE.getConf().isPresent());
    if (HCatContext.INSTANCE.getConf().isPresent()) {
        System.err.println("HCatContext tinyint->int promotion says " + HCatContext.INSTANCE.getConf().get()
                .getBoolean(HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION,
                        HCatConstants.HCAT_DATA_TINY_SMALL_INT_PROMOTION_DEFAULT));
    }

    HCatSchema tableSchema = inpy.getTableSchema(ijob.getConfiguration());
    System.err.println("Copying from [" + in + "] to [" + out + "] with schema : " + tableSchema.toString());
    oupy.setSchema(ojob, tableSchema);
    oupy.checkOutputSpecs(ojob);
    OutputCommitter oc = oupy.getOutputCommitter(createTaskAttemptContext(ojob.getConfiguration()));
    oc.setupJob(ojob);

    for (InputSplit split : inpy.getSplits(ijob)) {

        TaskAttemptContext rtaskContext = createTaskAttemptContext(ijob.getConfiguration());
        TaskAttemptContext wtaskContext = createTaskAttemptContext(ojob.getConfiguration());

        RecordReader<WritableComparable, HCatRecord> rr = inpy.createRecordReader(split, rtaskContext);
        rr.initialize(split, rtaskContext);

        OutputCommitter taskOc = oupy.getOutputCommitter(wtaskContext);
        taskOc.setupTask(wtaskContext);
        RecordWriter<WritableComparable<?>, HCatRecord> rw = oupy.getRecordWriter(wtaskContext);

        while (rr.nextKeyValue()) {
            rw.write(rr.getCurrentKey(), rr.getCurrentValue());
        }
        rw.close(wtaskContext);
        taskOc.commitTask(wtaskContext);
        rr.close();
    }

    oc.commitJob(ojob);
}

From source file:org.apache.hyracks.dataflow.hadoop.mapreduce.MapperOperatorDescriptor.java

License:Apache License

@SuppressWarnings("deprecation")
@Override
public IOperatorNodePushable createPushRuntime(final IHyracksTaskContext ctx,
        IRecordDescriptorProvider recordDescProvider, final int partition, final int nPartitions)
        throws HyracksDataException {
    final HadoopHelper helper = new HadoopHelper(config);
    final Configuration conf = helper.getConfiguration();
    final Mapper<K1, V1, K2, V2> mapper = helper.getMapper();
    final InputFormat<K1, V1> inputFormat = helper.getInputFormat();
    final IInputSplitProvider isp = factory.createInputSplitProvider(partition);
    final TaskAttemptID taId = new TaskAttemptID("foo", jobId, true, partition, 0);
    final TaskAttemptContext taskAttemptContext = helper.createTaskAttemptContext(taId);

    final int framesLimit = helper.getSortFrameLimit(ctx);
    final IBinaryComparatorFactory[] comparatorFactories = helper.getSortComparatorFactories();

    class SortingRecordWriter extends RecordWriter<K2, V2> {
        private final ArrayTupleBuilder tb;
        private final IFrame frame;
        private final FrameTupleAppender fta;
        private ExternalSortRunGenerator runGen;
        private int blockId;

        public SortingRecordWriter() throws HyracksDataException {
            tb = new ArrayTupleBuilder(2);
            frame = new VSizeFrame(ctx);
            fta = new FrameTupleAppender(frame);
        }

        public void initBlock(int blockId) throws HyracksDataException {
            runGen = new ExternalSortRunGenerator(ctx, new int[] { 0 }, null, comparatorFactories,
                    helper.getMapOutputRecordDescriptorWithoutExtraFields(), Algorithm.MERGE_SORT, framesLimit);
            this.blockId = blockId;
        }

        @Override
        public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
        }

        @Override
        public void write(K2 key, V2 value) throws IOException, InterruptedException {
            DataOutput dos = tb.getDataOutput();
            tb.reset();
            key.write(dos);
            tb.addFieldEndOffset();
            value.write(dos);
            tb.addFieldEndOffset();
            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
                if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                    throw new HyracksDataException("Record size (" + tb.getSize() + ") larger than frame size ("
                            + frame.getBuffer().capacity() + ")");
                }
            }
        }

        public void sortAndFlushBlock(final IFrameWriter writer) throws HyracksDataException {
            if (fta.getTupleCount() > 0) {
                runGen.nextFrame(frame.getBuffer());
                fta.reset(frame, true);
            }
            runGen.close();
            IFrameWriter delegatingWriter = new IFrameWriter() {
                private final FrameTupleAppender appender = new FrameTupleAppender(new VSizeFrame(ctx));
                private final FrameTupleAccessor fta = new FrameTupleAccessor(
                        helper.getMapOutputRecordDescriptorWithoutExtraFields());
                private final ArrayTupleBuilder tb = new ArrayTupleBuilder(3);

                @Override
                public void open() throws HyracksDataException {
                }

                @Override
                public void nextFrame(ByteBuffer buffer) throws HyracksDataException {
                    fta.reset(buffer);
                    int n = fta.getTupleCount();
                    for (int i = 0; i < n; ++i) {
                        tb.reset();
                        tb.addField(fta, i, 0);
                        tb.addField(fta, i, 1);
                        try {
                            tb.getDataOutput().writeInt(blockId);
                        } catch (IOException e) {
                            throw new HyracksDataException(e);
                        }
                        tb.addFieldEndOffset();
                        if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            appender.flush(writer, true);
                            if (!appender.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }
                }

                @Override
                public void close() throws HyracksDataException {
                    appender.flush(writer, true);
                }

                @Override
                public void fail() throws HyracksDataException {
                    // TODO Auto-generated method stub

                }
            };
            if (helper.hasCombiner()) {
                Reducer<K2, V2, K2, V2> combiner = helper.getCombiner();
                TaskAttemptID ctaId = new TaskAttemptID("foo", jobId, true, partition, 0);
                TaskAttemptContext ctaskAttemptContext = helper.createTaskAttemptContext(taId);
                final IFrameWriter outputWriter = delegatingWriter;
                RecordWriter<K2, V2> recordWriter = new RecordWriter<K2, V2>() {
                    private final FrameTupleAppender fta = new FrameTupleAppender(new VSizeFrame(ctx));
                    private final ArrayTupleBuilder tb = new ArrayTupleBuilder(2);

                    {
                        outputWriter.open();
                    }

                    @Override
                    public void write(K2 key, V2 value) throws IOException, InterruptedException {
                        DataOutput dos = tb.getDataOutput();
                        tb.reset();
                        key.write(dos);
                        tb.addFieldEndOffset();
                        value.write(dos);
                        tb.addFieldEndOffset();
                        if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                            fta.flush(outputWriter, true);
                            if (!fta.append(tb.getFieldEndOffsets(), tb.getByteArray(), 0, tb.getSize())) {
                                throw new IllegalStateException();
                            }
                        }
                    }

                    @Override
                    public void close(TaskAttemptContext context) throws IOException, InterruptedException {
                        fta.flush(outputWriter, true);
                    }
                };
                delegatingWriter = new ReduceWriter<K2, V2, K2, V2>(ctx, helper,
                        new int[] { HadoopHelper.KEY_FIELD_INDEX }, helper.getGroupingComparatorFactories(),
                        helper.getMapOutputRecordDescriptorWithoutExtraFields(), combiner, recordWriter, ctaId,
                        ctaskAttemptContext);
            }
            IBinaryComparator[] comparators = new IBinaryComparator[comparatorFactories.length];
            for (int i = 0; i < comparatorFactories.length; ++i) {
                comparators[i] = comparatorFactories[i].createBinaryComparator();
            }
            ExternalSortRunMerger merger = new ExternalSortRunMerger(ctx, runGen.getSorter(), runGen.getRuns(),
                    new int[] { 0 }, comparators, null, helper.getMapOutputRecordDescriptorWithoutExtraFields(),
                    framesLimit, delegatingWriter);
            merger.process();
        }
    }

    return new AbstractUnaryOutputSourceOperatorNodePushable() {
        @SuppressWarnings("unchecked")
        @Override
        public void initialize() throws HyracksDataException {
            try {
                writer.open();
                SortingRecordWriter recordWriter = new SortingRecordWriter();
                InputSplit split = null;
                int blockId = 0;
                while ((split = isp.next()) != null) {
                    try {
                        RecordReader<K1, V1> recordReader = inputFormat.createRecordReader(split,
                                taskAttemptContext);
                        ClassLoader ctxCL = Thread.currentThread().getContextClassLoader();
                        try {
                            Thread.currentThread().setContextClassLoader(getClass().getClassLoader());
                            recordReader.initialize(split, taskAttemptContext);
                        } finally {
                            Thread.currentThread().setContextClassLoader(ctxCL);
                        }
                        recordWriter.initBlock(blockId);
                        Mapper<K1, V1, K2, V2>.Context mCtx = new MRContextUtil().createMapContext(conf, taId,
                                recordReader, recordWriter, null, null, split);
                        mapper.run(mCtx);
                        recordReader.close();
                        recordWriter.sortAndFlushBlock(writer);
                        ++blockId;
                    } catch (IOException e) {
                        throw new HyracksDataException(e);
                    } catch (InterruptedException e) {
                        throw new HyracksDataException(e);
                    }
                }
            } catch (Throwable th) {
                writer.fail();
                throw th;
            } finally {
                writer.close();
            }
        }
    };
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNodeTupleInputFormatTests.java

License:Apache License

protected final int countTuples(RecordReader<LongWritable, T> reader) throws IOException, InterruptedException {
    int count = 0;

    // Check initial progress
    LOG.info(String.format("Initial Reported Progress %f", reader.getProgress()));
    float progress = reader.getProgress();
    if (Float.compare(0.0f, progress) == 0) {
        Assert.assertEquals(0.0d, reader.getProgress(), 0.0d);
    } else if (Float.compare(1.0f, progress) == 0) {
        // If reader is reported 1.0 straight away then we expect there to
        // be no key values
        Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);
        Assert.assertFalse(reader.nextKeyValue());
    } else {
        Assert.fail(String.format(
                "Expected progress of 0.0 or 1.0 before reader has been accessed for first time but got %f",
                progress));
    }

    // Count tuples
    boolean debug = LOG.isDebugEnabled();
    while (reader.nextKeyValue()) {
        count++;
        progress = reader.getProgress();
        if (debug)
            LOG.debug(String.format("Current Reported Progress %f", progress));
        Assert.assertTrue(String.format("Progress should be in the range 0.0 < p <= 1.0 but got %f", progress),
                progress > 0.0f && progress <= 1.0f);
    }
    reader.close();
    LOG.info(String.format("Got %d tuples from this record reader", count));

    // Check final progress
    LOG.info(String.format("Final Reported Progress %f", reader.getProgress()));
    Assert.assertEquals(1.0d, reader.getProgress(), 0.0d);

    return count;
}

From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java

License:Apache License

/**
 * From each split sampled, take the first numSamples / numSplits records.
 */
@SuppressWarnings("unchecked")
// ArrayList::toArray doesn't preserve type
public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException {
    List<InputSplit> splits = inf.getSplits(job);
    ArrayList<K> samples = new ArrayList<K>(numSamples);
    int splitsToSample = Math.min(maxSplitsSampled, splits.size());
    int samplesPerSplit = numSamples / splitsToSample;
    log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit);
    long records = 0;
    for (int i = 0; i < splitsToSample; ++i) {
        TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(),
                new TaskAttemptID());
        InputSplit split = splits.get(i);
        log.debug("Sampling {} split", split);
        RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext);
        reader.initialize(split, samplingContext);
        while (reader.nextKeyValue()) {
            LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey();
            // TODO: why do we need to do that? Why on earth we have -1 in subject, predicate or object position???
            if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) {
                LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1),
                        currentKey.get(2), currentKey.get(3));
                log.debug("Sampled {}", key);
                samples.add((K) key);
                ++records;
                if (records >= (i + 1) * samplesPerSplit) {
                    log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records,
                            (i + 1) * samplesPerSplit);
                    break;
                }
            }
        }
        reader.close();
    }
    return (K[]) samples.toArray();
}

From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java

License:Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum
    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader = inputFormat
                .createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}