Example usage for org.apache.hadoop.mapreduce.task TaskAttemptContextImpl TaskAttemptContextImpl

List of usage examples for org.apache.hadoop.mapreduce.task TaskAttemptContextImpl TaskAttemptContextImpl

Introduction

On this page you can find example usage for the org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl constructor.

Prototype

public TaskAttemptContextImpl(Configuration conf, TaskAttemptID taskId) 
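
A minimal sketch of calling this constructor before the full test listings; the wrapper class and the attempt id string are hypothetical, for illustration only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TaskAttemptContextExample {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // parse an id of the form attempt_<jtIdentifier>_<jobId>_<m|r>_<taskId>_<id>
        TaskAttemptID taskID = TaskAttemptID.forName("attempt_201805011433_0001_m_000000_0"); // hypothetical id
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
        System.out.println(context.getTaskAttemptID());
        // a fresh, empty TaskAttemptID also works, as in the read-side contexts in the examples below
        TaskAttemptContext readContext = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        System.out.println(readContext.getTaskAttemptID());
    }
}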


Usage

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data, including custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // the following filter matches the written metadata except for the category
    conf.set("hadoopoffice.read.filter.metadata.category", "no Category");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.revision", "2");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hhadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has wrongly NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}
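
All examples on this page share the same scaffolding around TaskAttemptContextImpl: configure a task attempt, write through a FileOutputCommitter, commit the task, then read the committed file back with a fresh context. Below is a condensed sketch of that pattern, with hypothetical values for the attempt id and output directory standing in for the test fixtures (attempt, taskID, tmpPath) of the source file:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.task.JobContextImpl;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class WriteCommitReadSketch {
    public static void main(String[] args) throws Exception {
        String attempt = "attempt_201805011433_0001_m_000000_0"; // hypothetical fixture value
        TaskAttemptID taskID = TaskAttemptID.forName(attempt);
        Job job = Job.getInstance();
        Configuration conf = job.getConfiguration();
        Path outputPath = new Path("/tmp/officetest"); // hypothetical output directory
        FileOutputFormat.setOutputPath(job, outputPath);
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
        conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
        // job-level context for the committer, task-level context for writer and reader
        JobContext jContext = new JobContextImpl(conf, taskID.getJobID());
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
        FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
        committer.setupJob(jContext);
        committer.setupTask(context);
        // ... obtain a RecordWriter from an OutputFormat, write records, close it ...
        committer.commitTask(context);
        // the read-back side constructs a fresh context over the committed output
        TaskAttemptContext readContext = new TaskAttemptContextImpl(conf, new TaskAttemptID());
        System.out.println(readContext.getTaskAttemptID());
    }
}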

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // the following filter matches the written metadata except for the application name
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "1");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has (wrongly) NOT
    // been filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchOncePositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatapositiveoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data, including custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // a single matching metadata property is sufficient (matchAll = false)
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // of the following filter values only the category matches the written metadata
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus2");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype2");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator2");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription2");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier2");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.revision", "3");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty12");
    conf.set("hhadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty22");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchOncePositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheetmetadatapositiveoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // a single matching metadata property is sufficient (matchAll = false)
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // of the following filter values only the application name matches the written metadata
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyautho2r");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchOnceNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatanativeoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data, including custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // a single matching metadata property would be sufficient (matchAll = false)
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // none of the following filter values match the written metadata
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory2");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus2");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype2");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator2");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription2");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier2");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.revision", "3");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty12");
    conf.set("hhadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty22");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has (wrongly) NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchOnceNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2003singlesheetmetadatanegativeoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // a single matching metadata property would be sufficient (matchAll = false)
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // none of the following filter values match the written metadata
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyautho2r");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has (wrongly) NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetGZipCompressed()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing to do
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheetcompressedtestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx.gz");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx.gz]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx.gz]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contains row 2, which is empty");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 4 with cell 1 == \"3\"");
}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetComment() throws IOException, InterruptedException {
    // 2nd cell with a comment
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "This is a test", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetcommenttestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("This is a test", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(),
            "Input Split for Excel file contains row 1 with cell 2 comment == \"This is a test\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");

}

From source file: org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2013MultiSheet() throws IOException, InterruptedException {
    // one sheet "Sheet1"
    // one row string and three columns ("test1","test2","test3")
    SpreadSheetCellDAO sheet1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO sheet1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO sheet1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // one sheet "Sheet2"
    // one row string and three columns ("test4","test5","test6")
    SpreadSheetCellDAO sheet2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet2");
    SpreadSheetCellDAO sheet2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet2");
    SpreadSheetCellDAO sheet2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet2");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013multisheettestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format; the default anyway, but set here for illustrative purposes
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, sheet1a1);
    writer.write(null, sheet1b1);
    writer.write(null, sheet1c1);
    writer.write(null, sheet2a1);
    writer.write(null, sheet2b1);
    writer.write(null, sheet2c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to that of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 3 columns for Sheet1");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet2!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet2!A1\"");
    assertEquals(3, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 3 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test5\"");
    assertEquals("test6", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test6\"");
}
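
Note on the read-back path in the test above: committer.commitTask(context) only promotes the task attempt's output to the committed task path under "_temporary/0/<task>"; because committer.commitJob(jContext) is never called here, the file stays there and never reaches the output directory itself. A minimal sketch, assuming the committer, context, fileName, and tmpDir variables from the test above, of resolving that location through the FileOutputCommitter API instead of hard-coding the "_temporary" layout:

    // Where commitTask(context) placed the file (FileOutputCommitter algorithm
    // version 1): the committed task path under the job attempt directory.
    Path committedTaskPath = committer.getCommittedTaskPath(context);
    Path uncommittedFile = new Path(committedTaskPath, fileName + ".xlsx");
    // Only after committer.commitJob(jContext) would the file be promoted to
    // the job output directory itself:
    Path committedFile = new Path(tmpDir + File.separator + fileName + ".xlsx");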

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetOneLinkedWorkbook()
        throws IOException, InterruptedException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String linkedWB1FileName = "excel2003linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", linkedWB1FileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // write mainworkbook
    linkedWB1FileName = linkedWB1FileName + this.outputbaseAppendix;
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + linkedWB1FileName + ".xls]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    // write
    job = Job.getInstance();
    conf = job.getConfiguration();
    String mainWBfileName = "excel2003singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", mainWBfileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    conf.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    jContext = new JobContextImpl(conf, taskID.getJobID());

    context = new TaskAttemptContextImpl(conf, taskID);
    committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(context);
    assertNotNull(writerMain, "Format returned null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.close(context);
    committer.commitTask(context);
    committer.commitJob(jContext);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    mainWBfileName = mainWBfileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + mainWBfileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 Sheet1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
}
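
Note on the linked-workbook read in the test above: setting "hadoopoffice.read.linkedworkbooks" to true makes the reader load the referenced external workbooks, so that a formula such as "[<linkedworkbook>.xls]Sheet1!B1" is evaluated against the linked file; that is why cell B1 of the main workbook comes back as "test2". A minimal sketch, reusing the configuration keys that already appear in the test, of the tolerant variant for cases where a linked workbook may be missing at read time:

    // Evaluate formulas against linked workbooks, but skip references whose
    // backing file cannot be found instead of failing the whole read.
    conf.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    conf.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", true);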