Usage examples for org.apache.hadoop.mapreduce.RecordReader#nextKeyValue
public abstract boolean nextKeyValue() throws IOException, InterruptedException;
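Before the project-specific examples below, here is a minimal sketch of the canonical nextKeyValue() loop over the splits of a TextInputFormat. It is an illustrative assumption, not taken from any of the examples: the input path argument, the no-argument TaskAttemptID, and the use of Hadoop 2's TaskAttemptContextImpl are placeholders for whatever driver or test harness you already have.

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class NextKeyValueSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, new Path(args[0])); // hypothetical input path

        TextInputFormat input = new TextInputFormat();
        List<InputSplit> splits = input.getSplits(job);
        TaskAttemptContext ctx = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = input.createRecordReader(split, ctx);
            reader.initialize(split, ctx);          // must be called before the first nextKeyValue()
            try {
                while (reader.nextKeyValue()) {     // advances to the next record, false at end of split
                    LongWritable key = reader.getCurrentKey();
                    Text value = reader.getCurrentValue();
                    System.out.println(key.get() + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}

The examples that follow use the same pattern: obtain a RecordReader from an InputFormat, initialize it against a split, drain it with nextKeyValue()/getCurrentKey()/getCurrentValue(), then close it.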
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 */
protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
        throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
        total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);

    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {
        InputSplit split = sorted[partition];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(conf);
        int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput);
        }

        mapper.cleanup(secondOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(), NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceBufferDataTest.java
License:Apache License
@Test(enabled = true, dependsOnMethods = { "testWriteBufferData" })
public void testReadBufferData() throws Exception {
    long reccnt = 0L;
    long tsize = 0L;
    byte[] buf;
    Checksum cs = new CRC32();
    cs.reset();

    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            m_partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(m_partfns); // keep the order for checksum

    for (int idx = 0; idx < m_partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", m_partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, m_partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableBuffer<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableBuffer<?>>, DurableBuffer<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableBuffer<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableBuffer<?>> dbufval = null;
        while (reader.nextKeyValue()) {
            dbufval = reader.getCurrentValue();
            assert dbufval.getValue().getSize() == dbufval.getValue().get().capacity();
            dbufval.getValue().get().clear();
            buf = new byte[dbufval.getValue().get().capacity()];
            dbufval.getValue().get().get(buf);
            cs.update(buf, 0, buf.length);
            tsize += dbufval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of buffer is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceChunkDataTest.java
License:Apache License
@Test(enabled = true, dependsOnMethods = { "testWriteChunkData" })
public void testReadChunkData() throws Exception {
    List<String> partfns = new ArrayList<String>();
    long reccnt = 0L;
    long tsize = 0L;
    Checksum cs = new CRC32();
    cs.reset();

    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            partfns.add(listfiles[idx].getName());
        }
    }
    Collections.sort(partfns); // keep the order for checksum

    for (int idx = 0; idx < partfns.size(); ++idx) {
        System.out.println(String.format("Verifying : %s", partfns.get(idx)));
        FileSplit split = new FileSplit(new Path(m_workdir, partfns.get(idx)), 0, 0L, new String[0]);
        InputFormat<NullWritable, MneDurableInputValue<DurableChunk<?>>> inputFormat =
                new MneInputFormat<MneDurableInputValue<DurableChunk<?>>, DurableChunk<?>>();
        RecordReader<NullWritable, MneDurableInputValue<DurableChunk<?>>> reader =
                inputFormat.createRecordReader(split, m_tacontext);
        MneDurableInputValue<DurableChunk<?>> dchkval = null;
        while (reader.nextKeyValue()) {
            dchkval = reader.getCurrentValue();
            byte b;
            for (int j = 0; j < dchkval.getValue().getSize(); ++j) {
                b = unsafe.getByte(dchkval.getValue().get() + j);
                cs.update(b);
            }
            tsize += dchkval.getValue().getSize();
            ++reccnt;
        }
        reader.close();
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_totalsize, tsize);
    AssertJUnit.assertEquals(m_checksum, cs.getValue());
    System.out.println(String.format("The checksum of chunk is %d", m_checksum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreduceLongDataTest.java
License:Apache License
@Test(enabled = true, dependsOnMethods = { "testWriteLongData" })
public void testReadLongData() throws Exception {
    long sum = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Long>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Long>, Long>();
            RecordReader<NullWritable, MneDurableInputValue<Long>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Long> mdval = null;
            while (reader.nextKeyValue()) {
                mdval = reader.getCurrentValue();
                sum += mdval.getValue();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_sum, sum);
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    System.out.println(String.format("The checksum of long data is %d", sum));
}
From source file:org.apache.mnemonic.mapreduce.MneMapreducePersonDataTest.java
License:Apache License
@Test(enabled = true, dependsOnMethods = { "testWritePersonData" })
public void testReadPersonData() throws Exception {
    long sumage = 0L;
    long reccnt = 0L;
    File folder = new File(m_workdir.toString());
    File[] listfiles = folder.listFiles();
    for (int idx = 0; idx < listfiles.length; ++idx) {
        if (listfiles[idx].isFile()
                && listfiles[idx].getName().startsWith(MneConfigHelper.getBaseOutputName(m_conf, null))
                && listfiles[idx].getName().endsWith(MneConfigHelper.DEFAULT_FILE_EXTENSION)) {
            System.out.println(String.format("Verifying : %s", listfiles[idx].getName()));
            FileSplit split = new FileSplit(new Path(m_workdir, listfiles[idx].getName()), 0, 0L, new String[0]);
            InputFormat<NullWritable, MneDurableInputValue<Person<Long>>> inputFormat =
                    new MneInputFormat<MneDurableInputValue<Person<Long>>, Person<Long>>();
            RecordReader<NullWritable, MneDurableInputValue<Person<Long>>> reader =
                    inputFormat.createRecordReader(split, m_tacontext);
            MneDurableInputValue<Person<Long>> personval = null;
            while (reader.nextKeyValue()) {
                personval = reader.getCurrentValue();
                AssertJUnit.assertTrue(personval.getValue().getAge() < 51);
                sumage += personval.getValue().getAge();
                ++reccnt;
            }
            reader.close();
        }
    }
    AssertJUnit.assertEquals(m_reccnt, reccnt);
    AssertJUnit.assertEquals(m_sumage, sumage);
    System.out.println(String.format("The checksum of ages is %d", sumage));
}
From source file:org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java
License:Apache License
@Test
public void testPredicatePushdown() throws Exception {
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 0);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    final String typeStr = "struct<i:int,s:string>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 4000 rows with the integer and the binary string
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 4000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new Text(Integer.toBinaryString(r)));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    OrcInputFormat.setSearchArgument(conf,
            SearchArgumentFactory.newBuilder()
                    .between("i", PredicateLeaf.Type.LONG, new Long(1500), new Long(1999)).build(),
            new String[] { null, "i", "s" });
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader =
            new OrcInputFormat<OrcStruct>().createRecordReader(split, attemptContext);
    // the sarg should cause it to skip over the rows except 1000 to 2000
    for (int r = 1000; r < 2000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(Integer.toBinaryString(r), row.getFieldValue(1).toString());
    }
    assertEquals(false, reader.nextKeyValue());
}
From source file:org.apache.orc.mapreduce.TestMapreduceOrcOutputFormat.java
License:Apache License
@Test
public void testColumnSelection() throws Exception {
    String typeStr = "struct<i:int,j:int,k:int>";
    OrcConf.MAPRED_OUTPUT_SCHEMA.setString(conf, typeStr);
    conf.set("mapreduce.output.fileoutputformat.outputdir", workDir.toString());
    conf.setInt(OrcConf.ROW_INDEX_STRIDE.getAttribute(), 1000);
    conf.setBoolean(OrcOutputFormat.SKIP_TEMP_DIRECTORY, true);
    TaskAttemptID id = new TaskAttemptID("jt", 0, TaskType.MAP, 0, 1);
    TaskAttemptContext attemptContext = new TaskAttemptContextImpl(conf, id);
    OutputFormat<NullWritable, OrcStruct> outputFormat = new OrcOutputFormat<OrcStruct>();
    RecordWriter<NullWritable, OrcStruct> writer = outputFormat.getRecordWriter(attemptContext);

    // write 3000 rows with three integer columns
    TypeDescription type = TypeDescription.fromString(typeStr);
    OrcStruct row = (OrcStruct) OrcStruct.createValue(type);
    NullWritable nada = NullWritable.get();
    for (int r = 0; r < 3000; ++r) {
        row.setFieldValue(0, new IntWritable(r));
        row.setFieldValue(1, new IntWritable(r * 2));
        row.setFieldValue(2, new IntWritable(r * 3));
        writer.write(nada, row);
    }
    writer.close(attemptContext);

    conf.set(OrcConf.INCLUDE_COLUMNS.getAttribute(), "0,2");
    FileSplit split = new FileSplit(new Path(workDir, "part-m-00000.orc"), 0, 1000000, new String[0]);
    RecordReader<NullWritable, OrcStruct> reader =
            new OrcInputFormat<OrcStruct>().createRecordReader(split, attemptContext);
    // only columns 0 and 2 are read back; the excluded column comes back as null
    for (int r = 0; r < 3000; ++r) {
        assertEquals(true, reader.nextKeyValue());
        row = reader.getCurrentValue();
        assertEquals(r, ((IntWritable) row.getFieldValue(0)).get());
        assertEquals(null, row.getFieldValue(1));
        assertEquals(r * 3, ((IntWritable) row.getFieldValue(2)).get());
    }
    assertEquals(false, reader.nextKeyValue());
}
From source file:org.apache.parquet.hadoop.thrift.TestParquetToThriftReadWriteAndProjection.java
License:Apache License
private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
        T exptectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadWriteAndProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }

    // create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));

    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();

    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
            .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split, taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
            LOG.info(readValue);
        }
    }
    assertEquals(exptectedReadResult, readValue);
}