Example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue

List of usage examples for org.apache.hadoop.mapreduce RecordReader nextKeyValue

Introduction

In this page you can find the example usage for org.apache.hadoop.mapreduce RecordReader nextKeyValue.

Prototype

public abstract boolean nextKeyValue() throws IOException, InterruptedException;

Source Link

Document

Read the next key, value pair.

Usage

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void readExcelInputFormatBzip2CompressedExcel2013MultiSheetAll()
        throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec bzip2 = new BZip2Codec();
    ReflectionUtils.setConf(bzip2, conf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2013testmultisheet.xlsx.bz2";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    Job job = Job.getInstance(conf);/* ww  w .  jav a 2  s . c o m*/
    FileInputFormat.setInputPaths(job, file);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[excel2013testmultisheet.xlsx.bz2]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2013testmultisheet.xlsx.bz2]Sheet1!A1\"");
    assertEquals(4, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 4 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[3]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 4 == \"test4\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 2 with 1 column");
    assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 2 with cell 1 == \"4\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(5, spreadSheetValue.get().length, "Input Split for Excel file contains row 3 with 5 columns");
    assertEquals("31/12/99", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"31/12/99\"");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"5\"");
    assertNull(spreadSheetValue.get()[2], "Input Split for Excel file contains row 3 with cell 3 == null");
    assertNull(spreadSheetValue.get()[3], "Input Split for Excel file contains row 3 with cell 4 == null");
    assertEquals("null", ((SpreadSheetCellDAO) spreadSheetValue.get()[4]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 5 == \"null\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 4 with 1 column");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 4 with cell 1 == \"1\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 5 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 5 with 3 columns");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 5 with cell 1 == \"2\"");
    assertEquals("6", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 5 with cell 2== \"6\"");
    assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 5 with cell 3== \"10\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 6 (first sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 6 with 3 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 6 with cell 1 == \"3\"");
    assertEquals("4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 6 with cell 2== \"4\"");
    assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 6 with cell 3== \"15\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 7 (second sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("8", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 7 with cell 1 == \"8\"");
    assertEquals("99", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 7 with cell 2 == \"99\"");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 7 with 2 columns");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 8 (second sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contains row 8 with 1 column");
    assertEquals("test", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 8 with cell 1 == \"test\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 9 (second sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 9 with 3 columns");
    assertNull(spreadSheetValue.get()[0], "Input Split for Excel file contains row 9 with cell 1 == null");
    assertNull(spreadSheetValue.get()[1], "Input Split for Excel file contains row 9 with cell 2 == null");
    assertEquals("seven", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 9 with cell 3 == \"seven\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void readExcelInputFormatExcel2013MultiSheetHeader() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "multisheetheader.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    // set locale to the one of the test data
    conf.set("hadoopoffice.locale.bcp47", "us");
    conf.set("hadoopoffice.read.header.read", "true");
    conf.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
    Job job = Job.getInstance(conf);// ww w. j  ava2 s. com
    FileInputFormat.setInputPaths(job, file);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    assertEquals("column1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
            " header column 1 correctly read");
    assertEquals("column2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
            " header column 2 correctly read");
    assertEquals("column3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
            " header column 3 correctly read");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();

    // First Sheet
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A2 = \"1\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B2 = \"test1\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C2 = \"10\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A3 = \"2\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B3 = \"test3\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C3 = \"15\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");

    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A4 = \"10\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B4 = \"test2\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("20", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C4 = \"20\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C4", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");

    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("50", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A2 = \"50\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B2 = \"test1\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("80", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C2 = \"80\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");

    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("60", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A3 = \"60\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B3 = \"test3\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("90", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C3 = \"90\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");

    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("70", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A4 = \"70\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B4 = \"test2\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B4", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("10000", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "C4 = \"10000\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C4", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains no further row");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void readExcelInputFormatExcel2013MultiSheetSkipWithHeader() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "skipsheet.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    // set locale to the one of the test data
    conf.set("hadoopoffice.locale.bcp47", "us");
    conf.set("hadoopoffice.read.header.read", "true");
    conf.set("hadoopoffice.read.header.skipheaderinallsheets", "true");
    conf.set("hadoopoffice.read.sheet.skiplines.num", "5");
    conf.set("hadoopoffice.read.sheet.skiplines.allsheets", "true");
    Job job = Job.getInstance(conf);//w  ww  .  ja v a 2 s  . c  o m
    FileInputFormat.setInputPaths(job, file);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    assertEquals("column1", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[0],
            " header column 1 correctly read");
    assertEquals("column2", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[1],
            " header column 2 correctly read");
    assertEquals("column3", ((ExcelRecordReader) reader).getOfficeReader().getCurrentParser().getHeader()[2],
            " header column 3 correctly read");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    // First Sheet
    assertEquals("20", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A7 = \"20\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B7 = \"test2\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B7", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C7 = \"5\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C7", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("30", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A8 = \"30\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A8", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B8 = \"test1\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B8", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("10", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C8 = \"10\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C8", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("40", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A9 = \"40\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A9", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B9 = \"test3\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B9", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("15", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C9 = \"15\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C9", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("90", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A7 = \"90\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B7 = \"test2\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B7", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("230", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C7 = \"230\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C7", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("200", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A8 = \"200\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A8", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B8 = \"test1\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B8", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("240", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C8 = \"240\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C8", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3 (Second Sheet)");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("101", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "A9 = \"101\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormula(), "Empty formula");
    assertEquals("A9", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(), "Correct sheet");

    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "B9 = \"test3\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormula(), "Empty formula");
    assertEquals("B9", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getSheetName(), "Correct sheet");

    assertEquals("250", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "C9 = \"250\"");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getComment(), "Empty Comment");
    assertEquals("", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormula(), "Empty formula");
    assertEquals("C9", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getAddress(), "Correct address");
    assertEquals("Sheet2", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getSheetName(), "Correct sheet");
    // third sheet is skipped because it does not contain enough rows
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains no further row");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheet() throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing todo
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write//  w w w.  j  av  a 2 s . co m
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheettestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    // set generic outputformat settings
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    FileInputFormat.setInputPaths(job, inputFile);
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetEncryptedPositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing todo
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write//from ww w  .  j ava2s  .  c om
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheettestoutencryptedpositive";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set generic outputformat settings
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // excel
    // format
    // security
    // for the new Excel format you need to decide on your own which algorithms are
    // secure
    conf.set("hadoopoffice.write.security.crypt.encrypt.mode", "agile");
    conf.set("hadoopoffice.write.security.crypt.encrypt.algorithm", "aes256");
    conf.set("hadoopoffice.write.security.crypt.chain.mode", "cbc");
    conf.set("hadoopoffice.write.security.crypt.hash.algorithm", "sha512");
    conf.set("hadoopoffice.write.security.crypt.password", "test");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // you just need to provide the password to read encrypted data
    conf.set("hadoopoffice.read.security.crypt.password", "test");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    FileInputFormat.setInputPaths(job, inputFile);
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetEncryptedPositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing todo
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write/*from ww w  . j a v a 2s. c  o  m*/
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheettestoutencryptedpositive";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old excel format
    // security
    // for the old Excel format you simply need to define only a password
    conf.set("hadoopoffice.write.security.crypt.password", "test");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.read.security.crypt.password", "test");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    FileInputFormat.setInputPaths(job, inputFile);
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 2");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 3");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 4");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchAllPositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write/*from  w  w w . j a  v a2s .  c o  m*/
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2013singlesheetmetadatapositivetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    // set all the meta data including to custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // following filter
    conf.set("hadoopoffice.read.filter.metadata.category", "dummycategory");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.revision", "2");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hhadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchAllPositive()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write//from  ww  w. j a v  a 2 s .  c  o m
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2003singlesheetmetadatapositivetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    conf.set("mapreduce.output.basename", fileName);

    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    // conf.set("hadoopoffice.write.metadata.template","dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    // conf.set("hadoopoffice.write.metadata.wordcount","1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // following filter
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "1");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is not true that means the document has (wrongly) been
    // filtered out
    assertTrue(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
    spreadSheetKey = reader.getCurrentKey();
    spreadSheetValue = reader.getCurrentValue();
    assertEquals("[" + fileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2013SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write//w w  w.j  ava 2 s. com
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();

    String fileName = "excel2013singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    // set all the meta data including to custom properties
    conf.set("hadoopoffice.write.metadata.category", "dummycategory");
    conf.set("hadoopoffice.write.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.write.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.write.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.write.metadata.description", "dummydescription");
    conf.set("hadoopoffice.write.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.write.metadata.revision", "2");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hadoopoffice.write.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // following filter
    conf.set("hadoopoffice.read.filter.metadata.category", "no Category");
    conf.set("hadoopoffice.read.filter.metadata.contentstatus", "dummycontentstatus");
    conf.set("hadoopoffice.read.filter.metadata.contenttype", "dummycontenttype");
    conf.set("hadoopoffice.read.filter.metadata.created", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.creator", "dummycreator");
    conf.set("hadoopoffice.read.filter.metadata.description", "dummydescription");
    conf.set("hadoopoffice.read.filter.metadata.identifier", "dummyidentifier");
    conf.set("hadoopoffice.read.filter.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.modified", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.lastmodifiedbyuser", "dummylastmodifiedbyuser");
    conf.set("hadoopoffice.read.filter.metadata.revision", "2");
    conf.set("hadoopoffice.read.filter.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.custom.mycustomproperty1", "dummymycustomproperty1");
    conf.set("hhadoopoffice.read.filter.metadata.custom.mycustomproperty2", "dummymycustomproperty2");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion is true that means the document has wrongly NOT been
    // filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}

From source file:org.zuinnote.hadoop.office.format.mapreduce.OfficeFormatHadoopExcelNormalTest.java

License:Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetMetaDataMatchAllNegative()
        throws IOException, InterruptedException {
    // one row string and three columns ("test1","test2","test3")
    // the idea here is to have some content although we only evaluate metadata
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write//from w ww  .j  av a 2s . c o m
    Job job = Job.getInstance();
    Configuration conf = job.getConfiguration();
    String fileName = "excel2003singlesheetmetadatanegativetestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    conf.set("mapreduce.output.basename", fileName);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    conf.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); // old Excel format

    // set all the meta data
    conf.set("hadoopoffice.write.metadata.applicationname", "dummyapplicationname");
    conf.set("hadoopoffice.write.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.write.metadata.charcount", "1");
    conf.set("hadoopoffice.write.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.write.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.edittime", "0");
    conf.set("hadoopoffice.write.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.write.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.write.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.write.metadata.pagecount", "1");
    conf.set("hadoopoffice.write.metadata.revnumber", "1");
    conf.set("hadoopoffice.write.metadata.security", "0");
    conf.set("hadoopoffice.write.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.write.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.write.metadata.title", "dummytitle");
    conf.set("hadoopoffice.write.metadata.wordcount", "1");
    conf.set(MRJobConfig.TASK_ATTEMPT_ID, attempt);
    conf.setInt(MRJobConfig.APPLICATION_ATTEMPT_ID, 0);
    conf.setInt(FileOutputCommitter.FILEOUTPUTCOMMITTER_ALGORITHM_VERSION, 1);
    FileOutputFormat.setOutputPath(job, outputPath);
    JobContext jContext = new JobContextImpl(conf, taskID.getJobID());

    TaskAttemptContext context = new TaskAttemptContextImpl(conf, taskID);
    FileOutputCommitter committer = new FileOutputCommitter(outputPath, context);
    // setup
    committer.setupJob(jContext);
    committer.setupTask(context);
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(context);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.close(context);
    committer.commitTask(context);
    // try to read it again
    conf = new Configuration(defaultConf);
    job = Job.getInstance(conf);
    fileName = fileName + this.outputbaseAppendix;
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + taskAttempt + File.separator + fileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    conf.set("hadoopoffice.read.locale.bcp47", "de");
    // set metadata to match all
    conf.set("hadoopoffice.read.filter.metadata.matchAll", "true");
    // following filter
    conf.set("hadoopoffice.read.filter.metadata.applicationname", "dummyapplicationname2");
    conf.set("hadoopoffice.read.filter.metadata.metadata.author", "dummyauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.charcount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.comments", "dummycomments");
    conf.set("hadoopoffice.read.filter.metadata.metadata.createdatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.edittime", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.keywords", "dummykeywords");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastauthor", "dummylastauthor");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastprinted", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.lastsavedatetime", "12:00:00 01.01.2016");
    conf.set("hadoopoffice.read.filter.metadata.metadata.pagecount", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.revnumber", "1");
    conf.set("hadoopoffice.read.filter.metadata.metadata.security", "0");
    conf.set("hadoopoffice.read.filter.metadata.metadata.subject", "dummysubject");
    conf.set("hadoopoffice.read.filter.metadata.metadata.template", "dummytemplate");
    conf.set("hadoopoffice.read.filter.metadata.metadata.title", "dummytitle");
    conf.set("hadoopoffice.read.filter.metadata.metadata.wordcount", "1");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    List<InputSplit> splits = inputFormat.getSplits(job);
    assertEquals(1, splits.size(), "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.createRecordReader(splits.get(0), context);
    assertNotNull(reader, "Format returned  null RecordReader");
    reader.initialize(splits.get(0), context);
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    // if following assertion not true that means the document has (wrongly) NOT
    // been filtered out
    assertFalse(reader.nextKeyValue(), "Input Split for Excel file contains row 1");
}