Example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce RecordReader nextKeyValue.

Prototype

public abstract boolean nextKeyValue() throws IOException, InterruptedException;

Document

Read the next key, value pair.
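
The typical pattern is to initialize the reader once for its split, loop on nextKeyValue() until it returns false, fetch each pair with getCurrentKey() and getCurrentValue(), and close the reader when done. A minimal sketch of that loop, assuming a TextInputFormat split and a TaskAttemptContext are already in scope (the setup names here are illustrative, not taken from the sources below):

// Minimal read loop over one split (illustrative sketch).
// Needs: org.apache.hadoop.io.LongWritable, org.apache.hadoop.io.Text,
//        org.apache.hadoop.mapreduce.*, org.apache.hadoop.mapreduce.lib.input.TextInputFormat
TextInputFormat input = new TextInputFormat();
RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
reader.initialize(split, context);
try {
    while (reader.nextKeyValue()) {
        LongWritable key = reader.getCurrentKey();   // byte offset of the line
        Text value = reader.getCurrentValue();       // the line itself
        // process the key/value pair here
    }
} finally {
    reader.close();
}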

Usage

From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java

License:Apache License

/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 */
protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
        throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
        total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);
    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {

        InputSplit split = sorted[partition];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(conf);
        int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput);
        }

        mapper.cleanup(secondOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}

From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java

License:Apache License

public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(),
            NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }

}

From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java

License:Apache License

public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}

From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java

License:Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum
    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader = inputFormat
                .createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}

From source file:org.apache.mnemonic.mapreduce.MneMapreduceChunkDataTest.java

License:Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
    List<String> partfns = new ArrayList<String>();
    long reccnt = 0L;
    long tsize = 0L;
    Checksum cs = new CRC32();
    cs.reset();
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(partfns); // keep the order for checksum
    for (int idx = 0; idx < partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat = new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader = inputFormat
                .createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableChunk<?>> dchkval = null;
        while (reader.nextKeyValue()) {
            dchkval = reader.getCurrentValue();
            byte b;
            for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
                b = unsafe.getByte(dchkval.getValue().get() + j);
                cs.update(b);
            }
            tsize += dchkval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}

From source file:org.apache.mnemonic.mapreduce.MneMapreduceLongDataTest.java

License:Apache License

@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
    long sum = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L,
                    new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat = new MneInputFormat<MneDurableInputValue<Long>, Long>();
            RecordReader<NullWritable, MneDurableInputValue<Long>> reader = inputFormat
                    .createRecordReader(split, m_tacontext);
            MneDurableInputValue<Long> mdval = null;
            while (reader.nextKeyValue()) {
                mdval = reader.getCurrentValue();
                sum += mdval.getValue();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_sum, sum);
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    System.out.println(String.format("The checksum of long data is %d", sum));
}

From source file:org.apache.mnemonic.mapreduce.MneMapreducePersonDataTest.java

License:Apache License

@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L,
                    new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat = new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader = inputFormat
                    .createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}

From source file:org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java

License:Apache License

@Test
public void testPredicatePushdown() throws Exception {
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    final String typeStr = "struct<i:int,s:string>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 4000 rows with the integer and the binary string
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 4000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new Text(Integer.toBinaryString(r)));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    OrcInputFormat.setSearchArgument(conf,
            SearchArgumentFactory.newBuilder()
                    .between("i", PredicateLeaf.Type.LONG, new Long(1500), new Long(1999)).build(),
            new String[] { null, "i", "s" });
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader = new OrcInputFormat<OrcStruct>().createRecordReader(split,
            attemptContext);
    // the sarg should cause it to skip over the rows except 1000 to 2000
    for (int r = 1000; r < 2000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(Integer.toBinaryString(r), row.getFieldValue(1).toString());
    }
    assertEquals(false, reader.nextKeyValue());
}

From source file:org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java

License:Apache License

@Test
public void testColumnSelection() throws Exception {
    String typeStr = "struct<i:int,j:int,k:int>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 1);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 3000 rows with three integer columns (i, j, k)
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 3000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new IntWritable(r * 2));
        row.setFieldValue(2, new IntWritable(r * 3));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute(), "0,2");
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader = new OrcInputFormat<OrcStruct>().createRecordReader(split,
            attemptContext);
    // with only columns 0 and 2 included, column 1 (j) should come back as null
    for (int r = 0; r < 3000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(null, row.getFieldValue(1));
        assertEquals(r * 3, ((IntWritable) row.getFieldValue(2)).get());
    }
    assertEquals(false, reader.nextKeyValue());
}

From source file:org.apache.parquet.hadoop.thrift.TestParquetToThriftReadWriteAndProjection.java

License:Apache License

private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
        T exptectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }

    //create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();

    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
            .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split,
                taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
            LOG.info(readValue);
        }
    }
    assertEquals(exptectedReadResult, readValue);

}