Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputCommitter commitJob

List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputCommitter commitJob

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce.lib.output FileOutputCommitter commitJob.

Prototype

public void commitJob(JobContext context) throws IOException 

Source Link

Document

The job has completed, so do works in commitJobInternal().

Usage

From source file:org.apache.tajo.storage.hbase.HBaseStorageManager.java

License:Apache License

@Override
public Path commitOutputData(OverridableConf queryContext, ExecutionBlockId finalEbId, LogicalPlan plan,
        Schema schema, TableDesc tableDesc) throws IOException {
    if (tableDesc == null) {
        throw new IOException("TableDesc is null while calling loadIncrementalHFiles: " + finalEbId);
    }/*from   w w w.  j  av a2  s . com*/
    Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR));
    Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);

    Configuration hbaseConf = HBaseStorageManager.getHBaseConfiguration(queryContext.getConf(),
            tableDesc.getMeta());
    hbaseConf.set("hbase.loadincremental.threads.max", "2");

    JobContextImpl jobContext = new JobContextImpl(hbaseConf,
            new JobID(finalEbId.getQueryId().toString(), finalEbId.getId()));

    FileOutputCommitter committer = new FileOutputCommitter(stagingResultDir, jobContext);
    Path jobAttemptPath = committer.getJobAttemptPath(jobContext);
    FileSystem fs = jobAttemptPath.getFileSystem(queryContext.getConf());
    if (!fs.exists(jobAttemptPath) || fs.listStatus(jobAttemptPath) == null) {
        LOG.warn("No query attempt file in " + jobAttemptPath);
        return stagingResultDir;
    }
    committer.commitJob(jobContext);

    if (tableDesc.getName() == null && tableDesc.getPath() != null) {

        // insert into location
        return super.commitOutputData(queryContext, finalEbId, plan, schema, tableDesc, false);
    } else {
        // insert into table
        String tableName = tableDesc.getMeta().getOption(HBaseStorageConstants.META_TABLE_KEY);

        HTable htable = new HTable(hbaseConf, tableName);
        try {
            LoadIncrementalHFiles loadIncrementalHFiles = null;
            try {
                loadIncrementalHFiles = new LoadIncrementalHFiles(hbaseConf);
            } catch (Exception e) {
                LOG.error(e.getMessage(), e);
                throw new IOException(e.getMessage(), e);
            }
            loadIncrementalHFiles.doBulkLoad(stagingResultDir, htable);

            return stagingResultDir;
        } finally {
            htable.close();
        }
    }
}

From source file:org.apache.tajo.storage.hbase.HBaseTablespace.java

License:Apache License

@Override
public Path commitTable(OverridableConf queryContext, ExecutionBlockId finalEbId, LogicalPlan plan,
        Schema schema, TableDesc tableDesc) throws IOException {
    if (tableDesc == null) {
        throw new IOException("TableDesc is null while calling loadIncrementalHFiles: " + finalEbId);
    }/*from  w  ww .  j  av  a2  s.co  m*/

    Path stagingDir = new Path(queryContext.get(QueryVars.STAGING_DIR));
    Path stagingResultDir = new Path(stagingDir, TajoConstants.RESULT_DIR_NAME);

    Configuration hbaseConf = HBaseConfiguration.create(this.hbaseConf);
    hbaseConf.set("hbase.loadincremental.threads.max", "2");

    JobContextImpl jobContext = new JobContextImpl(hbaseConf,
            new JobID(finalEbId.getQueryId().toString(), finalEbId.getId()));

    FileOutputCommitter committer = new FileOutputCommitter(stagingResultDir, jobContext);
    Path jobAttemptPath = committer.getJobAttemptPath(jobContext);
    FileSystem fs = jobAttemptPath.getFileSystem(queryContext.getConf());
    if (!fs.exists(jobAttemptPath) || fs.listStatus(jobAttemptPath) == null) {
        LOG.warn("No query attempt file in " + jobAttemptPath);
        return stagingResultDir;
    }
    committer.commitJob(jobContext);

    // insert into table
    String tableName = tableDesc.getMeta().getOption(HBaseStorageConstants.META_TABLE_KEY);

    HTable htable = new HTable(hbaseConf, tableName);
    try {
        LoadIncrementalHFiles loadIncrementalHFiles = null;
        try {
            loadIncrementalHFiles = new LoadIncrementalHFiles(hbaseConf);
        } catch (Exception e) {
            LOG.error(e.getMessage(), e);
            throw new IOException(e.getMessage(), e);
        }
        loadIncrementalHFiles.doBulkLoad(stagingResultDir, htable);

        return stagingResultDir;
    } finally {
        htable.close();
    }
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetOneLinkedWorkbook()
        throws IOException, InterruptedException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write/*w  w w  . ja v  a  2  s .c o m*/
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String linkedWB1FileName = "excel2003linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB1FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write mainworkbook
    linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix;
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xls]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String mainWBfileName = "excel2003singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", mainWBfileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    mainWBfileName = mainWBfileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetTwoLinkedWorkbooks()
        throws IOException, InterruptedException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write/*ww w.  jav a2  s .  com*/
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String linkedWB1FileName = "excel2003linkedwb1b";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB1FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write linkedworkbook2
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1");
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String linkedWB2FileName = "excel2003linkedwb2b";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB2FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    outputFormat = new ExcelFileOutputFormat();
    writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb2a1);
    writer.write(null, wb2b1);
    writer.write(null, wb2c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write mainworkbook
    linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix;
    linkedWB2FileName = linkedWB2FileName + this.outputbaseAppendix;
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xls]:[" + tmpDir
            + File.separator + linkedWB2FileName + ".xls]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "[" + linkedWB2FileName + ".xls]Sheet1!B1", "C1",
            "Sheet1"); // should be test5 in the end
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String mainWBfileName = "excel2003singlesheetlinkedwb2testout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", mainWBfileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.write(null, c1);
    writerMain.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    mainWBfileName = mainWBfileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 3 columns for Sheet1");
    assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test7\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test5\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184")
@Test// w  w  w .j  a v  a  2s  . c  o m
public void writeExcelOutputFormatExcel2013SingleSheetOneLinkedWorkbook()
        throws IOException, InterruptedException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String linkedWB1FileName = "excel2013linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB1FileName);

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write mainworkbook
    linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix;
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xlsx]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xlsx]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();

    String mainWBfileName = "excel2013singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", mainWBfileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // try to read it again
    job = Job.getInstance(conf);
    mainWBfileName = mainWBfileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184")
@Test/*from   w  w  w. j a  v  a  2 s  .co m*/
public void writeExcelOutputFormatExcel2013SingleSheetTwoLinkedWorkbooks()
        throws IOException, InterruptedException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String linkedWB1FileName = "excel2013linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB1FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write linkedworkbook2
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1");
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String linkedWB2FileName = "excel2013linkedwb2";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB2FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.commitTask(context);
    committer.commitJob(jContext);
    outputFormat = new ExcelFileOutputFormat();
    writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb2a1);
    writer.write(null, wb2b1);
    writer.write(null, wb2c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write mainworkbook
    linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix;
    linkedWB2FileName = linkedWB2FileName + this.outputbaseAppendix;
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xlsx]:[" + tmpDir
            + File.separator + linkedWB2FileName + ".xlsx]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB1FileName + ".xlsx]Sheet1'!B1", "B1",
            "Sheet1"); // should be test2 in the end
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB2FileName + ".xlsx]Sheet1'!B1", "B1",
            "Sheet1"); // should be test5 in the end
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String mainWBfileName = "excel2013singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", mainWBfileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);

    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.write(null, c1);
    writerMain.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // try to read it again
    job = Job.getInstance(conf);
    mainWBfileName = mainWBfileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test7\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test5\"");
}