Example usage for org.apache.hadoop.mapreduce.lib.output FileOutputCommitter FileOutputCommitter

List of usage examples for org.apache.hadoop.mapreduce.lib.output FileOutputCommitter FileOutputCommitter

Introduction

On this page you can find an example usage of org.apache.hadoop.mapreduce.lib.output FileOutputCommitter FileOutputCommitter.

Prototype

@Private
public FileOutputCommitter(Path outputPath, JobContext context) throws IOException 

Source Link

Document

Create a file output committer

Usage

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetEncryptedNegative()
        throws IOException, InterruptedException {
    // Writes a password-protected Excel 2003 (.xls) file and then verifies that
    // reading it back with a WRONG password fails with an InterruptedException.
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address, String sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // row 2 is intentionally left empty
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    // NOTE(review): the name says "positive" although this is the negative test;
    // kept as-is because it determines the file name written to disk.
    String fileName = "excel2003singlesheettestoutencryptedpositive";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old excel format
    // security
    // for the old Excel format you simply need to define only a password
    conf.set("hadoopoffice.write.security.crypt.password", "test");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again, this time with a wrong password
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // read directly from the task attempt's temporary output directory
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // deliberately wrong password ("test2" instead of "test")
    conf.set("hadoopoffice.read.security.crypt.password", "test2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    TaskAttemptContext context2 = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context2);
    // initializing the reader with the wrong password must fail
    assertThrows(InterruptedException.class,
            () -> reader.initialize(splits.get(0), context2), "Exception is thrown in case of wrong password");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchAllPositive()
        throws IOException, InterruptedException {
    // Writes an Excel 2013 (.xlsx) file carrying a full set of core and custom
    // metadata, then reads it back with a filter that requires ALL metadata
    // attributes to match; the document must NOT be filtered out.
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatapositivetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format (the default anyway, set here for illustrative purposes)
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data including the custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // read directly from the task attempt's temporary output directory
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // require that ALL of the following filter attributes match
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // filter values mirror exactly what was written above
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.revision", "2");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    // fixed typo: key previously started with "hhadoopoffice" and was silently ignored
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchAllPositive()
        throws IOException, InterruptedException {
    // Writes an Excel 2003 (.xls) file carrying a full set of HPSF metadata, then
    // reads it back with a filter that requires ALL metadata attributes to match;
    // the document must NOT be filtered out.
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2003singlesheetmetadatapositivetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    // conf.set("hadoopoffice.write.metadata.template","dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    // conf.set("hadoopoffice.write.metadata.wordcount","1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // read directly from the task attempt's temporary output directory
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // require that ALL of the following filter attributes match
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // NOTE(review): the keys below contain a doubled "metadata.metadata." segment and
    // do not mirror the "hadoopoffice.read.filter.metadata.<attr>" pattern used by
    // the applicationname key above — verify against the hadoopoffice documentation
    // whether these filters are actually evaluated.
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "1");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    // fixed assertion message: this test produces an .xls file, not .xlsx
    assertEquals("[" + fileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // Writes an Excel 2013 (.xlsx) file with metadata, then reads it back with a
    // matchAll filter in which ONE attribute (category) deliberately mismatches;
    // the document MUST be filtered out, i.e. no row may be readable.
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format (the default anyway, set here for illustrative purposes)
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data including the custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // read directly from the task attempt's temporary output directory
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // require that ALL of the following filter attributes match
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // the category deliberately does NOT match what was written ("dummycategory")
    conf.set("hadoopoffice.read.filter.metadata.category", "no Category");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.revision", "2");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    // fixed typo: key previously started with "hhadoopoffice" and was silently ignored
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has wrongly NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // Writes an Excel 2003 (.xls) file with HPSF metadata, then reads it back with
    // a matchAll filter in which ONE attribute (applicationname) deliberately
    // mismatches; the document MUST be filtered out, i.e. no row may be readable.
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // read directly from the task attempt's temporary output directory
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // require that ALL of the following filter attributes match
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // the applicationname deliberately does NOT match what was written
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname2");
    // NOTE(review): the keys below contain a doubled "metadata.metadata." segment and
    // do not mirror the "hadoopoffice.read.filter.metadata.<attr>" pattern used by
    // the applicationname key above — verify against the hadoopoffice documentation
    // whether these filters are actually evaluated.
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "1");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion not true that means the document has (wrongly) NOT
    // been filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchOncePositive()
        throws IOException, InterruptedException {
    // Writes one row ("test1","test2","test3") into a new-format (.xlsx) workbook
    // together with a full set of core and custom metadata, then reads the file
    // back with hadoopoffice.read.filter.metadata.matchAll=false ("pass if at
    // least one filter matches"). Only the "category" filter matches what was
    // written, which is sufficient, so the row must be readable.
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatapositiveoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format (the default anyway; set explicitly for illustration)
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data including custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    // output path only needs to be set once (was set twice before)
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup and write via the task-attempt committer
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again from the task-attempt working directory
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // matchAll=false: one matching filter is enough for the document to pass
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // only "category" matches the written metadata; all other values differ
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus2");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype2");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator2");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription2");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier2");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.revision", "3");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty12");
    // fixed: key was misspelled "hhadoopoffice..." and therefore silently ignored
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty22");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchOncePositive()
        throws IOException, InterruptedException {
    // Writes one row ("test1","test2","test3") into an old-format (.xls) workbook
    // with a full set of HSSF summary metadata, then reads the file back with
    // hadoopoffice.read.filter.metadata.matchAll=false ("pass if at least one
    // filter matches"). Only the "applicationname" filter matches what was
    // written, which is sufficient, so the row must be readable.
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheetmetadatapositiveoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    // output path only needs to be set once (was set twice before)
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup and write via the task-attempt committer
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again from the task-attempt working directory
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // matchAll=false: one matching filter is enough for the document to pass
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // only "applicationname" matches the written metadata; all other values differ
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname");
    // NOTE(review): the keys below carry a doubled "metadata." segment; kept as-is
    // since these filters are intentionally non-matching anyway — confirm against
    // the HadoopOffice read-filter key naming before relying on them
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyautho2r");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    // fixed: the assertion message previously said ".xlsx" although this test uses ".xls"
    assertEquals("[" + fileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchOnceNegative()
        throws IOException, InterruptedException {
    // Writes one row ("test1","test2","test3") into a new-format (.xlsx) workbook
    // with a full set of core and custom metadata, then reads the file back with
    // hadoopoffice.read.filter.metadata.matchAll=false. Every filter value
    // deliberately differs from what was written, so NO filter matches and the
    // document must be filtered out (no row readable).
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatanativeoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format (the default anyway; set explicitly for illustration)
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    // set all the meta data including custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    // output path only needs to be set once (was set twice before)
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup and write via the task-attempt committer
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again from the task-attempt working directory
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // matchAll=false: one matching filter would be enough — but none matches here
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // every filter value differs from the written metadata
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory2");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus2");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype2");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator2");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription2");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier2");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser2");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.revision", "3");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty12");
    // fixed: key was misspelled "hhadoopoffice..." and therefore silently ignored
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty22");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has (wrongly) NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchOnceNegative()
        throws IOException, InterruptedException {
    // Writes one row ("test1","test2","test3") into an old-format (.xls) workbook
    // with a full set of HSSF summary metadata, then reads the file back with
    // hadoopoffice.read.filter.metadata.matchAll=false. Every filter value
    // deliberately differs from what was written, so NO filter matches and the
    // document must be filtered out (no row readable).
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2003singlesheetmetadatanegativeoncetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    // output path only needs to be set once (was set twice before)
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup and write via the task-attempt committer
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again from the task-attempt working directory
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // matchAll=false: one matching filter would be enough — but none matches here
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "false");
    // every filter value differs from the written metadata
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname2");
    // NOTE(review): the keys below carry a doubled "metadata." segment; kept as-is
    // since these filters are intentionally non-matching anyway — confirm against
    // the HadoopOffice read-filter key naming before relying on them
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyautho2r");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2017");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has (wrongly) NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

/**
 * Writes a single-sheet Excel 2013 (.xlsx) file with GZip compression enabled through
 * the standard MapReduce output compression settings, then reads the compressed file
 * back from the committer's task-attempt output directory and verifies that all rows
 * and cells survived the round trip.
 *
 * @throws IOException          if writing or reading the Excel file fails
 * @throws InterruptedException if the record writer/reader is interrupted
 */
@Test
public void writeExcelOutputFormatExcel2013SingleSheetGZipCompressed()
        throws IOException, InterruptedException {
    // Row 1: three string cells ("test1","test2","test3").
    // SpreadSheetCellDAO(formattedValue, comment, formula, address, sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // Row 2 is intentionally left empty => nothing to write for it.
    // Row 3: three numeric cells expressed as formulas (1, 2, 3).
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // Row 4: one formula cell (=A3+B3).
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // --- write phase ---
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheetcompressedtestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // Enable GZip compression for the generated output file.
    conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    conf.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // New Excel format (.xlsx); this is the default anyway, set here for illustrative purposes.
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // Set up job and task so the committer creates the temporary output directories.
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // --- read-back phase: verify the committed, compressed file ---
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    // With committer algorithm v1, commitTask places the output under
    // _temporary/0/<taskAttempt>/ — TODO confirm against the committer version in use.
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx.gz");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data (conf is also used by the TaskAttemptContext below)
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx.gz]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx.gz]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    // A3+B3 with A3=1, B3=2 evaluates to 3.
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
    // Release the reader's underlying stream (RecordReader is Closeable).
    reader.close();
}