Example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue

List of usage examples for org.apache.hadoop.mapreduce RecordReader nextKeyValue

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce RecordReader nextKeyValue.

Prototype

public abstract boolean nextKeyValue() throws IOException, InterruptedException;

Document

Read the next key, value pair.
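The examples below all follow the same pattern: call nextKeyValue() in a loop and read the current key and value only while it keeps returning true. As a minimal sketch of that contract (the class and method names RecordReaderUtil and countRecords are illustrative, not part of Hadoop):

import java.io.IOException;

import org.apache.hadoop.mapreduce.RecordReader;

/**
 * Illustrative helper, not part of Hadoop: drains any RecordReader and
 * counts how many key/value pairs nextKeyValue() yields.
 */
public final class RecordReaderUtil {

    private RecordReaderUtil() {
    }

    public static <K, V> long countRecords(RecordReader<K, V> reader)
            throws IOException, InterruptedException {
        long count = 0;
        try {
            // nextKeyValue() advances the reader; it returns false once the split is exhausted.
            while (reader.nextKeyValue()) {
                // getCurrentKey()/getCurrentValue() are only meaningful after a true return.
                K key = reader.getCurrentKey();
                V value = reader.getCurrentValue();
                count++;
            }
        } finally {
            reader.close();
        }
        return count;
    }
}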

Usage

From source file: co.cask.cdap.data.stream.StreamInputFormatTest.java

License: Apache License

@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
            Files.newOutputStreamSupplier(indexFile), 100L);

    StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
            Charsets.UTF_8.encode("hello world"), 1000);
    writer.append(streamEvent);
    writer.close();

    FormatSpecification formatSpec = new FormatSpecification(TextRecordFormat.class.getName(),
            Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
            Collections.<String, String>emptyMap());
    Configuration conf = new Configuration();
    StreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    StreamInputFormat format = new StreamInputFormat();

    // read all splits and store the results in the list
    List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
    List<InputSplit> inputSplits = format.getSplits(context);
    for (InputSplit split : inputSplits) {
        RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader = format
                .createRecordReader(split, context);
        recordReader.initialize(split, context);
        while (recordReader.nextKeyValue()) {
            recordsRead.add(recordReader.getCurrentValue());
        }
    }

    // should only have read 1 record
    Assert.assertEquals(1, recordsRead.size());
    GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
    Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
    Assert.assertEquals("hello world", eventData.getBody().get("body"));
}

From source file: co.cask.cdap.template.etl.common.ETLDBInputFormat.java

License: Apache License

@Override
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf) throws IOException {
    final RecordReader dbRecordReader = super.createDBRecordReader(split, conf);
    return new RecordReader() {
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            dbRecordReader.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return dbRecordReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentKey();
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return dbRecordReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            dbRecordReader.close();
            try {
                DriverManager.deregisterDriver(driverShim);
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    };
}

From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java

License: Apache License

@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);

    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);
    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java

License: Apache License

@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Job job = new Job(new JobConf());

    FileInputFormat.addInputPath(job, inputFile1);
    FileInputFormat.addInputPath(job, inputFile2);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);

    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);
    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.nextKeyValue());

    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}

From source file: com.cloudera.recordservice.examples.terasort.TeraInputFormat.java

License: Apache License

/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {

                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing parititions took " + (t3 - t2) + "ms");
}
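The Javadoc above summarizes the sampling scheme; the arithmetic that drives it sits in the first few lines of the method. A small hedged sketch with assumed numbers (these particular values are illustrative, not taken from a real job) makes the relationship between sampleSize, the split count, samples, recordsPerSample and sampleStep concrete:

// Assumed inputs for illustration only.
long sampleSize = 100000L;  // default SAMPLE_SIZE
int numSplits = 40;         // pretend the job produced 40 input splits

int samples = Math.min(10, numSplits);        // at most 10 sampler threads (NUM_PARTITIONS default)
long recordsPerSample = sampleSize / samples; // 10000 keys read by each sampler
int sampleStep = numSplits / samples;         // every 4th split gets sampled

System.out.println(samples + " samplers x " + recordsPerSample
        + " keys, one sample every " + sampleStep + " splits");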

From source file: com.datasalt.pangool.tuplemr.mapred.lib.input.HCatTupleInputFormat.java

License: Apache License

@Override
public RecordReader<ITuple, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {

    HCatInputFormat iF = new HCatInputFormat();

    @SuppressWarnings("rawtypes")
    final RecordReader<WritableComparable, HCatRecord> hCatRecordReader = iF.createRecordReader(split,
            taskContext);

    return new RecordReader<ITuple, NullWritable>() {

        ITuple tuple = new Tuple(pangoolSchema);

        @Override
        public void close() throws IOException {
            hCatRecordReader.close();
        }

        @Override
        public ITuple getCurrentKey() throws IOException, InterruptedException {
            HCatRecord record = hCatRecordReader.getCurrentValue();
            // Perform conversion between HCatRecord and Tuple
            for (int pos = 0; pos < schema.size(); pos++) {
                tuple.set(pos, record.get(pos));
            }
            return tuple;
        }

        @Override
        public NullWritable getCurrentValue() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return hCatRecordReader.getProgress();
        }

        @Override
        public void initialize(InputSplit iS, TaskAttemptContext context)
                throws IOException, InterruptedException {
            hCatRecordReader.initialize(iS, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return hCatRecordReader.nextKeyValue();
        }
    };
}

From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java

License: Apache License

public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of rows read must
    // be the same as the number written.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);

    HadoopUtils.deleteIfExists(fS, outPath);
}

From source file: com.facebook.hiveio.benchmark.InputBenchmark.java

License: Apache License

/**
 * Read all records from a RecordReader
 *
 * @param reader RecordReader
 * @return number of rows
 * @throws IOException I/O errors
 * @throws InterruptedException thread errors
 */
private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
        throws IOException, InterruptedException {
    long num = 0;
    while (reader.nextKeyValue()) {
        HiveReadableRecord record = reader.getCurrentValue();
        parseLongLongDouble(record);
        ++num;
    }
    return num;
}

From source file: com.facebook.hiveio.tailer.TailerCmd.java

License: Apache License

/**
 * Read input split
 *
 * @param split InputSplit
 * @param context Context
 * @throws IOException
 * @throws InterruptedException
 */
private void readSplit(InputSplit split, Context context) throws IOException, InterruptedException {
    TaskAttemptID taskId = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(context.hiveConf, taskId);
    RecordReader<WritableComparable, HiveReadableRecord> recordReader;
    recordReader = context.hiveApiInputFormat.createRecordReader(split, taskContext);
    recordReader.initialize(split, taskContext);

    int rowsParsed = 0;
    while (recordReader.nextKeyValue() && !context.limitReached(args.limit)) {
        HiveReadableRecord record = recordReader.getCurrentValue();
        if (args.parser.parseOnly) {
            rowParser.parse(record);
        } else {
            recordPrinter.printRecord(record, context.schema.numColumns(), context, args);
        }
        ++rowsParsed;
        if (context.rowsParsed.incrementAndGet() >= args.limit) {
            break;
        }
        if (rowsParsed % args.metricsOpts.updateRows == 0) {
            context.stats.addRows(args.metricsOpts.updateRows);
            rowsParsed = 0;
        }
    }
    context.stats.addRows(rowsParsed);
}

From source file: com.hadoop.mapreduce.TestLzoTextInputFormat.java

License: Open Source License

/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {

    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    //verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);

        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();

            md5.update(value.getBytes(), 0, value.getLength());
        }

        rr.close();
    }

    localFs.close();
    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}