List of usage examples for org.apache.hadoop.mapreduce.RecordReader#nextKeyValue
public abstract boolean nextKeyValue() throws IOException, InterruptedException;
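All of the examples below follow the same contract: obtain a RecordReader from an InputFormat, call initialize(split, context), advance it with nextKeyValue() until it returns false, read each pair through getCurrentKey() and getCurrentValue() (valid only after nextKeyValue() has returned true), and close() the reader when done. As a minimal sketch of that loop (the LongWritable/Text key and value types and the readAll helper name are placeholders for illustration, not taken from any of the sources below):

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordReader;

// Drains an already-initialized reader and returns the number of key/value pairs read.
// The caller is expected to have called reader.initialize(split, context) beforehand
// and to close() the reader afterwards.
private static long readAll(RecordReader<LongWritable, Text> reader)
        throws IOException, InterruptedException {
    long count = 0;
    while (reader.nextKeyValue()) { // returns false once the split is exhausted
        LongWritable key = reader.getCurrentKey();
        Text value = reader.getCurrentValue();
        // process the current pair here
        count++;
    }
    return count;
}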
From source file: co.cask.cdap.data.stream.StreamInputFormatTest.java
License: Apache License
@Test
public void testFormatStreamRecordReader() throws IOException, InterruptedException {
    File inputDir = tmpFolder.newFolder();
    File partition = new File(inputDir, "1.1000");
    partition.mkdirs();
    File eventFile = new File(partition, "bucket.1.0." + StreamFileType.EVENT.getSuffix());
    File indexFile = new File(partition, "bucket.1.0." + StreamFileType.INDEX.getSuffix());

    // write 1 event
    StreamDataFileWriter writer = new StreamDataFileWriter(Files.newOutputStreamSupplier(eventFile),
            Files.newOutputStreamSupplier(indexFile), 100L);
    StreamEvent streamEvent = new StreamEvent(ImmutableMap.of("header1", "value1", "header2", "value2"),
            Charsets.UTF_8.encode("hello world"), 1000);
    writer.append(streamEvent);
    writer.close();

    FormatSpecification formatSpec = new FormatSpecification(TextRecordFormat.class.getName(),
            Schema.recordOf("event", Schema.Field.of("body", Schema.of(Schema.Type.STRING))),
            Collections.<String, String>emptyMap());
    Configuration conf = new Configuration();
    StreamInputFormat.setBodyFormatSpecification(conf, formatSpec);
    StreamInputFormat.setStreamPath(conf, inputDir.toURI());
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

    StreamInputFormat format = new StreamInputFormat();

    // read all splits and store the results in the list
    List<GenericStreamEventData<StructuredRecord>> recordsRead = Lists.newArrayList();
    List<InputSplit> inputSplits = format.getSplits(context);
    for (InputSplit split : inputSplits) {
        RecordReader<LongWritable, GenericStreamEventData<StructuredRecord>> recordReader =
                format.createRecordReader(split, context);
        recordReader.initialize(split, context);
        while (recordReader.nextKeyValue()) {
            recordsRead.add(recordReader.getCurrentValue());
        }
    }

    // should only have read 1 record
    Assert.assertEquals(1, recordsRead.size());
    GenericStreamEventData<StructuredRecord> eventData = recordsRead.get(0);
    Assert.assertEquals(streamEvent.getHeaders(), eventData.getHeaders());
    Assert.assertEquals("hello world", eventData.getBody().get("body"));
}
From source file: co.cask.cdap.template.etl.common.ETLDBInputFormat.java
License: Apache License
@Override
protected RecordReader createDBRecordReader(DBInputSplit split, Configuration conf) throws IOException {
    final RecordReader dbRecordReader = super.createDBRecordReader(split, conf);
    return new RecordReader() {
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context)
                throws IOException, InterruptedException {
            dbRecordReader.initialize(split, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return dbRecordReader.nextKeyValue();
        }

        @Override
        public Object getCurrentKey() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentKey();
        }

        @Override
        public Object getCurrentValue() throws IOException, InterruptedException {
            return dbRecordReader.getCurrentValue();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return dbRecordReader.getProgress();
        }

        @Override
        public void close() throws IOException {
            dbRecordReader.close();
            try {
                DriverManager.deregisterDriver(driverShim);
            } catch (SQLException e) {
                throw new IOException(e);
            }
        }
    };
}
From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License: Apache License
@Test
public void testOneFile() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile = new Path(dir, "file1.txt");

    writeSequenceFile(inputFile);

    Job job = new Job(new JobConf());
    FileInputFormat.addInputPath(job, inputFile);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());
    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file: com.alexholmes.hadooputils.combine.seqfile.mapreduce.CombineSequenceFileTest.java
License: Apache License
@Test
public void testTwoFiles() throws IOException, InterruptedException {
    Path dir = new Path(tempFolder.getRoot().getAbsolutePath());

    CombineSequenceFileInputFormat<Text, Text> inputFormat = new CombineSequenceFileInputFormat<Text, Text>();
    Path inputFile1 = new Path(dir, "file1.txt");
    Path inputFile2 = new Path(dir, "file2.txt");

    writeSequenceFile(inputFile1);
    writeSequenceFile(inputFile2);

    Job job = new Job(new JobConf());
    FileInputFormat.addInputPath(job, inputFile1);
    FileInputFormat.addInputPath(job, inputFile2);

    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size());

    TaskAttemptID taskId = new TaskAttemptID("jt", 0, true, 0, 0);
    Configuration conf1 = new Configuration();
    TaskAttemptContext context1 = new TaskAttemptContext(conf1, taskId);
    RecordReader<Text, Text> rr = inputFormat.createRecordReader(splits.get(0), context1);
    rr.initialize(splits.get(0), context1);

    assertTrue(rr.nextKeyValue());
    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());
    assertEquals(0.5f, rr.getProgress(), 0.1);

    assertTrue(rr.nextKeyValue());
    assertEquals(key, rr.getCurrentKey());
    assertEquals(value, rr.getCurrentValue());

    assertFalse(rr.nextKeyValue());
    assertEquals(1.0f, rr.getProgress(), 0.1);
}
From source file: com.cloudera.recordservice.examples.terasort.TeraInputFormat.java
License: Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws Throwable if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final TeraInputFormat inFormat = new TeraInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            @Override
            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader =
                            inFormat.createRecordReader(splits.get(sampleStep * idx), context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println("Got an exception while reading splits "
                            + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file: com.datasalt.pangool.tuplemr.mapred.lib.input.HCatTupleInputFormat.java
License: Apache License
@Override
public RecordReader<ITuple, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext taskContext)
        throws IOException, InterruptedException {

    HCatInputFormat iF = new HCatInputFormat();

    @SuppressWarnings("rawtypes")
    final RecordReader<WritableComparable, HCatRecord> hCatRecordReader = iF.createRecordReader(split,
            taskContext);

    return new RecordReader<ITuple, NullWritable>() {

        ITuple tuple = new Tuple(pangoolSchema);

        @Override
        public void close() throws IOException {
            hCatRecordReader.close();
        }

        @Override
        public ITuple getCurrentKey() throws IOException, InterruptedException {
            HCatRecord record = hCatRecordReader.getCurrentValue();
            // Perform conversion between HCatRecord and Tuple
            for (int pos = 0; pos < schema.size(); pos++) {
                tuple.set(pos, record.get(pos));
            }
            return tuple;
        }

        @Override
        public NullWritable getCurrentValue() throws IOException, InterruptedException {
            return NullWritable.get();
        }

        @Override
        public float getProgress() throws IOException, InterruptedException {
            return hCatRecordReader.getProgress();
        }

        @Override
        public void initialize(InputSplit iS, TaskAttemptContext context)
                throws IOException, InterruptedException {
            hCatRecordReader.initialize(iS, context);
        }

        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            return hCatRecordReader.nextKeyValue();
        }
    };
}
From source file: com.datasalt.pangool.tuplemr.mapred.lib.output.TestTupleInputOutputFormat.java
License: Apache License
public void testSplits(long maxSplitSize, int generatedRows) throws IOException, InterruptedException,
        IllegalArgumentException, SecurityException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    logger.info("Testing maxSplitSize: " + maxSplitSize + " and generatedRows:" + generatedRows);
    FileSystem fS = FileSystem.get(getConf());
    Random r = new Random(1);
    Schema schema = new Schema("schema", Fields.parse("i:int,s:string"));
    ITuple tuple = new Tuple(schema);

    Path outPath = new Path(OUT);
    TupleFile.Writer writer = new TupleFile.Writer(FileSystem.get(getConf()), getConf(), outPath, schema);
    for (int i = 0; i < generatedRows; i++) {
        tuple.set("i", r.nextInt());
        tuple.set("s", r.nextLong() + "");
        writer.append(tuple);
    }
    writer.close();

    TupleInputFormat format = ReflectionUtils.newInstance(TupleInputFormat.class, getConf());
    Job job = new Job(getConf());
    FileInputFormat.setInputPaths(job, outPath);
    logger.info("Using max input split size: " + maxSplitSize);
    FileInputFormat.setMaxInputSplitSize(job, maxSplitSize);
    job.setInputFormatClass(FileInputFormat.class);

    // Read all the splits and count. The number of rows read must
    // be the same as the number written.
    int count = 0;
    for (InputSplit split : format.getSplits(job)) {
        TaskAttemptID attemptId = new TaskAttemptID(new TaskID(), 1);
        TaskAttemptContext attemptContext = TaskAttemptContextFactory.get(getConf(), attemptId);
        logger.info("Sampling split: " + split);
        RecordReader<ITuple, NullWritable> reader = format.createRecordReader(split, attemptContext);
        reader.initialize(split, attemptContext);
        while (reader.nextKeyValue()) {
            tuple = reader.getCurrentKey();
            count++;
        }
        reader.close();
    }

    assertEquals(generatedRows, count);
    HadoopUtils.deleteIfExists(fS, outPath);
}
From source file: com.facebook.hiveio.benchmark.InputBenchmark.java
License: Apache License
/**
 * Read all records from a RecordReader
 *
 * @param reader RecordReader
 * @return number of rows
 * @throws IOException I/O errors
 * @throws InterruptedException thread errors
 */
private static long readFully(RecordReader<WritableComparable, HiveReadableRecord> reader)
        throws IOException, InterruptedException {
    long num = 0;
    while (reader.nextKeyValue()) {
        HiveReadableRecord record = reader.getCurrentValue();
        parseLongLongDouble(record);
        ++num;
    }
    return num;
}
From source file: com.facebook.hiveio.tailer.TailerCmd.java
License: Apache License
/**
 * Read input split
 *
 * @param split InputSplit
 * @param context Context
 * @throws IOException
 * @throws InterruptedException
 */
private void readSplit(InputSplit split, Context context) throws IOException, InterruptedException {
    TaskAttemptID taskId = new TaskAttemptID();
    TaskAttemptContext taskContext = new TaskAttemptContext(context.hiveConf, taskId);
    RecordReader<WritableComparable, HiveReadableRecord> recordReader;
    recordReader = context.hiveApiInputFormat.createRecordReader(split, taskContext);
    recordReader.initialize(split, taskContext);

    int rowsParsed = 0;
    while (recordReader.nextKeyValue() && !context.limitReached(args.limit)) {
        HiveReadableRecord record = recordReader.getCurrentValue();
        if (args.parser.parseOnly) {
            rowParser.parse(record);
        } else {
            recordPrinter.printRecord(record, context.schema.numColumns(), context, args);
        }
        ++rowsParsed;
        if (context.rowsParsed.incrementAndGet() >= args.limit) {
            break;
        }
        if (rowsParsed % args.metricsOpts.updateRows == 0) {
            context.stats.addRows(args.metricsOpts.updateRows);
            rowsParsed = 0;
        }
    }
    context.stats.addRows(rowsParsed);
}
From source file: com.hadoop.mapreduce.TestLzoTextInputFormat.java
License: Open Source License
/**
 * Generate random data, compress it, index and md5 hash the data.
 * Then read it all back and md5 that too, to verify that it all went ok.
 *
 * @param testWithIndex Should we index or not?
 * @param charsToOutput How many characters of random data should we output.
 * @throws IOException
 * @throws NoSuchAlgorithmException
 * @throws InterruptedException
 */
private void runTest(boolean testWithIndex, int charsToOutput)
        throws IOException, NoSuchAlgorithmException, InterruptedException {

    if (!GPLNativeCodeLoader.isNativeCodeLoaded()) {
        LOG.warn("Cannot run this test without the native lzo libraries");
        return;
    }

    Configuration conf = new Configuration();
    conf.setLong("fs.local.block.size", charsToOutput / 2);
    // reducing block size to force a split of the tiny file
    conf.set("io.compression.codecs", LzopCodec.class.getName());

    FileSystem localFs = FileSystem.getLocal(conf);
    localFs.delete(outputDir, true);
    localFs.mkdirs(outputDir);

    Job job = new Job(conf);
    TextOutputFormat.setCompressOutput(job, true);
    TextOutputFormat.setOutputCompressorClass(job, LzopCodec.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(job.getConfiguration(),
            new TaskAttemptID("123", 0, TaskType.REDUCE, 1, 2));

    // create some input data
    byte[] expectedMd5 = createTestInput(outputDir, localFs, attemptContext, charsToOutput);

    if (testWithIndex) {
        Path lzoFile = new Path(outputDir, lzoFileName);
        LzoTextInputFormat.createIndex(localFs, lzoFile);
    }

    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, outputDir);

    List<InputSplit> is = inputFormat.getSplits(job);
    // verify we have the right number of lzo chunks
    if (testWithIndex && OUTPUT_BIG == charsToOutput) {
        assertEquals(3, is.size());
    } else {
        assertEquals(1, is.size());
    }

    // let's read it all and calculate the md5 hash
    for (InputSplit inputSplit : is) {
        RecordReader<LongWritable, Text> rr = inputFormat.createRecordReader(inputSplit, attemptContext);
        rr.initialize(inputSplit, attemptContext);
        while (rr.nextKeyValue()) {
            Text value = rr.getCurrentValue();
            md5.update(value.getBytes(), 0, value.getLength());
        }
        rr.close();
    }
    localFs.close();

    assertTrue(Arrays.equals(expectedMd5, md5.digest()));
}