List of usage examples for org.apache.hadoop.mapreduce RecordReader initialize
public abstract void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException;
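Every example below follows the same contract: obtain a RecordReader from an InputFormat via createRecordReader(), call initialize(split, context) once before the first nextKeyValue(), then iterate over key/value pairs. For reference, here is a minimal, self-contained sketch of that pattern using the stock TextInputFormat; the input path, job name, and class name are illustrative placeholders, not taken from any of the examples below.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class RecordReaderInitializeSketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "record-reader-sketch");
        FileInputFormat.setInputPaths(job, new Path("/tmp/input.txt")); // hypothetical input path

        TextInputFormat inputFormat = new TextInputFormat();
        TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        for (InputSplit split : inputFormat.getSplits(job)) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            // initialize must be called once before the first nextKeyValue()
            reader.initialize(split, context);
            try {
                while (reader.nextKeyValue()) {
                    LongWritable key = reader.getCurrentKey(); // byte offset of the line
                    Text value = reader.getCurrentValue();     // the line itself
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}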
From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184") @Test//from ww w. j av a 2 s . c om public void writeExcelOutputFormatExcel2013SingleSheetOneLinkedWorkbook() throws IOException, InterruptedException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write Job job = Job.getInstance(); Configuration conf = job.getConfiguration(); String linkedWB1FileName = "excel2013linkedwb1"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); conf.set("mapreduce.output.basename", linkedWB1FileName); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); conf.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1); FileOutputFormat.setOutputPath(job, outputPath); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(outputPath, context); // setup committer.setupJob(jContext); committer.setupTask(context); ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(context); committer.commitTask(context); committer.commitJob(jContext); // write mainworkbook linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix; String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xlsx]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xlsx]Sheet1!B1", "B1", "Sheet1"); // should be test2 in the end // write job = Job.getInstance(); conf = job.getConfiguration(); String mainWBfileName = "excel2013singlesheetlinkedwbtestout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); conf.set("mapreduce.output.basename", mainWBfileName); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); conf.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1); FileOutputFormat.setOutputPath(job, outputPath); 
jContext = new JobContextImpl(conf, taskID.getJobID()); context = new TaskAttemptContextImpl(conf, taskID); committer = new FileOutputCommitter(outputPath, context); // setup committer.setupJob(jContext); committer.setupTask(context); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.close(context); committer.commitTask(context); committer.commitJob(jContext); // try to read it again job = Job.getInstance(conf); mainWBfileName = mainWBfileName + this.outputbaseAppendix; Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xlsx"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks conf.setBoolean("hadoopoffice.read.linkedworkbooks", true); conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); List<InputSplit> splits = inputFormat.getSplits(job); assertEquals(1, splits.size(), "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context); assertNotNull(reader, "Format returned null RecordReader"); reader.initialize(splits.get(0), context); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1"); spreadSheetKey = reader.getCurrentKey(); spreadSheetValue = reader.getCurrentValue(); assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\""); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns for Sheet1"); assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test4\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); }
From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184") @Test// ww w . j a v a 2 s. co m public void writeExcelOutputFormatExcel2013SingleSheetTwoLinkedWorkbooks() throws IOException, InterruptedException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write Job job = Job.getInstance(); Configuration conf = job.getConfiguration(); String linkedWB1FileName = "excel2013linkedwb1"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); conf.set("mapreduce.output.basename", linkedWB1FileName); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); conf.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1); FileOutputFormat.setOutputPath(job, outputPath); JobContext jContext = new JobContextImpl(conf, taskID.getJobID()); TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID); FileOutputCommitter committer = new FileOutputCommitter(outputPath, context); // setup committer.setupJob(jContext); committer.setupTask(context); ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(context); committer.commitTask(context); committer.commitJob(jContext); // write linkedworkbook2 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1"); // write job = Job.getInstance(); conf = job.getConfiguration(); String linkedWB2FileName = "excel2013linkedwb2"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); conf.set("mapreduce.output.basename", linkedWB2FileName); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); conf.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1); FileOutputFormat.setOutputPath(job, outputPath); jContext = new JobContextImpl(conf, taskID.getJobID()); 
context = new TaskAttemptContextImpl(conf, taskID); committer = new FileOutputCommitter(outputPath, context); // setup committer.commitTask(context); committer.commitJob(jContext); outputFormat = new ExcelFileOutputFormat(); writer = outputFormat.getRecordWriter(context); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb2a1); writer.write(null, wb2b1); writer.write(null, wb2c1); writer.close(context); committer.commitTask(context); committer.commitJob(jContext); // write mainworkbook linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix; linkedWB2FileName = linkedWB2FileName + this.outputbaseAppendix; String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xlsx]:[" + tmpDir + File.separator + linkedWB2FileName + ".xlsx]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB1FileName + ".xlsx]Sheet1'!B1", "B1", "Sheet1"); // should be test2 in the end SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB2FileName + ".xlsx]Sheet1'!B1", "B1", "Sheet1"); // should be test5 in the end // write job = Job.getInstance(); conf = job.getConfiguration(); String mainWBfileName = "excel2013singlesheetlinkedwbtestout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); conf.set("mapreduce.output.basename", mainWBfileName); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); conf.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt); conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0); conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1); FileOutputFormat.setOutputPath(job, outputPath); jContext = new JobContextImpl(conf, taskID.getJobID()); context = new TaskAttemptContextImpl(conf, taskID); committer = new FileOutputCommitter(outputPath, context); // setup committer.setupJob(jContext); committer.setupTask(context); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.write(null, c1); writerMain.close(context); committer.commitTask(context); committer.commitJob(jContext); // try to read it again job = Job.getInstance(conf); mainWBfileName = mainWBfileName + this.outputbaseAppendix; Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xlsx"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data conf.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks conf.setBoolean("hadoopoffice.read.linkedworkbooks", true); conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); context = new TaskAttemptContextImpl(conf, new TaskAttemptID()); List<InputSplit> splits = inputFormat.getSplits(job); assertEquals(1, splits.size(), "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context); assertNotNull(reader, "Format 
returned null RecordReader"); reader.initialize(splits.get(0), context); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1"); spreadSheetKey = reader.getCurrentKey(); spreadSheetValue = reader.getCurrentValue(); assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\""); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns for Sheet1"); assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test7\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test5\""); }
From source file:parquet.hadoop.thrift.TestParquetToThriftReadProjection.java
License:Apache License
private <T extends TBase<?, ?>> void shouldDoProjection(Configuration conf, T recordToWrite,
        T expectedReadResult, Class<? extends TBase<?, ?>> thriftClass) throws Exception {
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }
    // create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, thriftClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();
    final ParquetThriftInputFormat<T> parquetThriftInputFormat = new ParquetThriftInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetThriftInputFormat
            .getSplits(ContextUtil.newJobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = ContextUtil.newTaskAttemptContext(
                ContextUtil.getConfiguration(job), new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetThriftInputFormat.createRecordReader(split,
                taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
            LOG.info(readValue);
        }
    }
    assertEquals(expectedReadResult, readValue);
}
From source file:parquet.scrooge.ParquetScroogeSchemeTest.java
License:Apache License
public <T> void verifyScroogeRead(TBase recordToWrite, Class<T> readClass, String expectedStr,
        String projectionFilter) throws Exception {
    Configuration conf = new Configuration();
    conf.set("parquet.thrift.converter.class", ScroogeRecordConverter.class.getName());
    conf.set(ThriftReadSupport.THRIFT_READ_CLASS_KEY, readClass.getName());
    conf.set(ThriftReadSupport.THRIFT_COLUMN_FILTER_KEY, projectionFilter);
    final Path parquetFile = new Path("target/test/TestParquetToThriftReadProjection/file.parquet");
    final FileSystem fs = parquetFile.getFileSystem(conf);
    if (fs.exists(parquetFile)) {
        fs.delete(parquetFile, true);
    }
    // create a test file
    final TProtocolFactory protocolFactory = new TCompactProtocol.Factory();
    final TaskAttemptID taskId = new TaskAttemptID("local", 0, true, 0, 0);
    Class writeClass = recordToWrite.getClass();
    final ThriftToParquetFileWriter w = new ThriftToParquetFileWriter(parquetFile,
            ContextUtil.newTaskAttemptContext(conf, taskId), protocolFactory, writeClass);
    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final TProtocol protocol = protocolFactory.getProtocol(new TIOStreamTransport(baos));
    recordToWrite.write(protocol);
    w.write(new BytesWritable(baos.toByteArray()));
    w.close();
    final ParquetScroogeInputFormat<T> parquetScroogeInputFormat = new ParquetScroogeInputFormat<T>();
    final Job job = new Job(conf, "read");
    job.setInputFormatClass(ParquetThriftInputFormat.class);
    ParquetThriftInputFormat.setInputPaths(job, parquetFile);
    final JobID jobID = new JobID("local", 1);
    List<InputSplit> splits = parquetScroogeInputFormat
            .getSplits(new JobContext(ContextUtil.getConfiguration(job), jobID));
    T readValue = null;
    for (InputSplit split : splits) {
        TaskAttemptContext taskAttemptContext = new TaskAttemptContext(ContextUtil.getConfiguration(job),
                new TaskAttemptID(new TaskID(jobID, true, 1), 0));
        final RecordReader<Void, T> reader = parquetScroogeInputFormat.createRecordReader(split,
                taskAttemptContext);
        reader.initialize(split, taskAttemptContext);
        if (reader.nextKeyValue()) {
            readValue = reader.getCurrentValue();
        }
    }
    assertEquals(expectedStr, readValue.toString());
}
From source file:reconcile.hbase.mapreduce.ZipInputFormat.java
License:Open Source License
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    RecordReader<Text, Text> reader = new ZipEntryRecordReader();
    reader.initialize(split, context);
    return reader;
}
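Note the design choice here: unlike the other examples on this page, which call initialize() from the reading code, this InputFormat initializes the reader eagerly inside createRecordReader() before returning it. Since the MapReduce framework also calls initialize() on the reader it receives before the first nextKeyValue(), a reader constructed this way should tolerate being initialized twice.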
From source file:uk.bl.wa.hadoop.mapreduce.lib.DereferencingArchiveToCDXRecordReaderTest.java
License:Open Source License
private void runCDXTest(Configuration conf, String expected) throws Exception {
    File testFile = new File("src/test/resources/rr-test-inputs.txt");
    Path path = new Path(testFile.getAbsoluteFile().toURI().toString());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);
    ArchiveToCDXFileInputFormat inputFormat = ReflectionUtils.newInstance(ArchiveToCDXFileInputFormat.class,
            conf);
    TaskAttemptContext context = new TaskAttemptContext(conf, new TaskAttemptID());
    RecordReader<Text, Text> reader = inputFormat.createRecordReader(split, context);
    reader.initialize(split, context);
    int position = 0;
    String value = "";
    while (reader.nextKeyValue()) {
        position += 1;
        if (position == 3)
            value = reader.getCurrentValue().toString();
    }
    // Check the third value is as expected
    log.debug(value);
    Assert.assertEquals(expected, value);
}