Example usage for org.apache.hadoop.mapred JobConf setBoolean

List of usage examples for org.apache.hadoop.mapred JobConf setBoolean

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.setBoolean from open-source projects.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
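
Before the project examples below, here is a minimal, self-contained sketch of the call in isolation. The key my.example.flag is made up purely for illustration; setBoolean stores the value as the string "true"/"false", and getBoolean reads it back, taking a default for the case where the key is absent.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // store a flag under an illustrative, made-up key
        conf.setBoolean("my.example.flag", true);
        // read it back; the second argument is the default returned if the key is unset
        boolean flag = conf.getBoolean("my.example.flag", false);
        System.out.println("my.example.flag = " + flag); // prints: my.example.flag = true
    }
}

In a real MapReduce job the flag is typically set in the driver and read back with getBoolean inside a mapper's or input format's configure(JobConf) method, as several of the examples below do.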

Usage

From source file: org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetOneLinkedWorkbook() throws IOException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String linkedWB1FileName = "excel2003linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job,
            linkedWB1FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(reporter);
    // write mainworkbook
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0"
            + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName
            + ".xls]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    // write
    job = new JobConf(defaultConf);
    String mainWBfileName = "excel2003singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job,
            mainWBfileName, null);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned  null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue),
            "Input Split for Excel file contains row 1 Sheet1");
    assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
}

From source file: org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Test
public void writeExcelOutputFormatExcel2003SingleSheetTwoLinkedWorkbooks() throws IOException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String linkedWB1FileName = "excel2003linkedwb1b";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job,
            linkedWB1FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(reporter);
    // write linkedworkbook2
    // one row string and three columns ("test4","test5","test6")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1");
    // write
    job = new JobConf(defaultConf);
    String linkedWB2FileName = "excel2003linkedwb2b";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    outputFormat = new ExcelFileOutputFormat();
    writer = outputFormat.getRecordWriter(null, job, linkedWB2FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb2a1);
    writer.write(null, wb2b1);
    writer.write(null, wb2c1);
    writer.close(reporter);
    // write mainworkbook
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0"
            + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName
            + ".xls]:[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + linkedWB2FileName + ".xls]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "[" + linkedWB2FileName + ".xls]Sheet1!B1", "C1",
            "Sheet1"); // should be test5 in the end
    // write
    job = new JobConf(defaultConf);
    String mainWBfileName = "excel2003singlesheetlinkedwb2testout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel");
    job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job,
            mainWBfileName, null);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.write(null, c1);
    writerMain.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xls");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned  null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue),
            "Input Split for Excel file contains row 1 Sheet1");
    assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 3 columns for Sheet1");
    assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test7\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test5\"");
}

From source file: org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184")
@Test
public void writeExcelOutputFormatExcel2013SingleSheetOneLinkedWorkbook() throws IOException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String linkedWB1FileName = "excel2013linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job,
            linkedWB1FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(reporter);
    // write mainworkbook
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0"
            + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName
            + ".xlsx]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xlsx]Sheet1!B1", "B1",
            "Sheet1"); // should be test2 in the end
    // write
    job = new JobConf(defaultConf);
    String mainWBfileName = "excel2013singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job,
            mainWBfileName, null);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned  null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue),
            "Input Split for Excel file contains row 1 Sheet1");
    assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\"");
    assertEquals(2, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 2 columns for Sheet1");
    assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test4\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
}

From source file: org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java

License: Apache License

@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184")
@Test
public void writeExcelOutputFormatExcel2013SingleSheetTwoLinkedWorkbooks() throws IOException {
    // write linkedworkbook1
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String linkedWB1FileName = "excel2013linkedwb1";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job,
            linkedWB1FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb1a1);
    writer.write(null, wb1b1);
    writer.write(null, wb1c1);
    writer.close(reporter);
    // write linkedworkbook2
    // one row string and three columns ("test4","test5","test6")
    // (String formattedValue, String comment, String formula, String address,String
    // sheetName)
    SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1");
    // write
    job = new JobConf(defaultConf);
    String linkedWB2FileName = "excel2013linkedwb2";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    outputFormat = new ExcelFileOutputFormat();
    writer = outputFormat.getRecordWriter(null, job, linkedWB2FileName, null);
    assertNotNull(writer, "Format returned  null RecordWriter");
    writer.write(null, wb2a1);
    writer.write(null, wb2b1);
    writer.write(null, wb2c1);
    writer.close(reporter);
    // write mainworkbook
    String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0"
            + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName
            + ".xlsx]:[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + linkedWB2FileName + ".xlsx]";
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB1FileName + ".xlsx]Sheet1'!B1", "B1",
            "Sheet1"); // should be test2 in the end
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB2FileName + ".xlsx]Sheet1'!B1", "C1",
            "Sheet1"); // should be test5 in the end
    // write
    job = new JobConf(defaultConf);
    String mainWBfileName = "excel2013singlesheetlinkedwbtestout";
    outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new
    // Excel
    // format,
    // anyway
    // default,
    // but
    // here
    // for
    // illustrative
    // purposes
    job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename);
    outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job,
            mainWBfileName, null);
    assertNotNull(writerMain, "Format returned  null RecordWriter");
    writerMain.write(null, a1);
    writerMain.write(null, b1);
    writerMain.write(null, c1);
    writerMain.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xlsx");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned  null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue),
            "Input Split for Excel file contains row 1 Sheet1");
    assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length,
            "Input Split for Excel file contains row 1 with 3 columns for Sheet1");
    assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test7\"");
    // this comes from the external workbook
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test5\"");
}

From source file: sa.edu.kaust.twitter.preprocess.spam.RemoveTweetsOfSpamUsers.java

License: Apache License

public static int removeTweetsOfSpamUsers(String inputPath, String outputPath, int numReducers,
        String spamUserListFile, long startID, long endID, String nTweetsFile, Boolean spam) throws Exception {
    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("spam user list file: " + spamUserListFile);

    JobConf conf = new JobConf(RemoveTweetsOfSpamUsers.class);
    FileSystem fs = FileSystem.get(conf);
    conf.setJobName("RemoveSpamUserTweets");
    conf.setLong("startID", startID);
    conf.setLong("endID", endID);
    conf.setNumReduceTasks(numReducers);
    conf.setBoolean("spam", spam);

    // put the mapping file in the distributed cache so each map worker will
    // have it
    //DistributedCache.addCacheFile(new URI(mappingFile), conf);

    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("SpamUserListFile", spamUserListFile);
    } else {
        DistributedCache.addCacheFile(new URI(spamUserListFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TweetWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    //FileSystem.get(conf).delete(new Path(outputPath), true);
    if (fs.exists(new Path(outputPath))) {
        sLogger.info("Output already exists: skipping!");
        return FSProperty.readInt(fs, nTweetsFile);
    }

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int nonSpamTweets = (int) counters.findCounter(Statistics.NON_SPAM_TWEETS).getCounter();
    FSProperty.writeInt(fs, nTweetsFile, nonSpamTweets);
    sLogger.info("num of non-spam tweets: " + nonSpamTweets);
    return nonSpamTweets;
}

From source file: StorageEngineClient.FormatStorageInputFormat_SplitByLineNum.java

License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    List<FormatStorageInputSplit_WithLineNum> splits = new ArrayList<FormatStorageInputSplit_WithLineNum>();

    int lenNum = job.getInt("hive.inputfiles.line_num_per_split", 1000000);
    if (lenNum < 10000) {
        LOG.info("lenNum been set to " + lenNum + " is too small, so set it to 1000000");
        lenNum = 1000000;
    }
    FileStatus[] fss = listStatus(job);

    FileStatus[] orignalFss = fss;
    List<FileStatus> fssList = new ArrayList<FileStatus>();
    for (int i = 0; i < fss.length; i++) {
        if (fss[i].getLen() > 0) {
            fssList.add(fss[i]);
        }
    }

    fss = (FileStatus[]) fssList.toArray(new FileStatus[0]);
    int listSize = fss.length;

    if (listSize == 0) {

        mapredWork mrWork = Utilities.getMapRedWork(job);
        Path inputPath = orignalFss[0].getPath();
        Path inputParentPath = inputPath.getParent();
        String inputPathStr = inputPath.toUri().toString();
        String inputPathParentStr = inputParentPath.toString();

        FileSystem fs = inputPath.getFileSystem(job);
        fs.delete(inputPath, true);

        LinkedHashMap<String, partitionDesc> partDescMap = mrWork.getPathToPartitionInfo();
        partitionDesc partDesc = partDescMap.get(inputPathParentStr);

        job.setBoolean("NeedPostfix", false);
        RecordWriter recWriter = new FormatStorageHiveOutputFormat().getHiveRecordWriter(job, inputPath,
                Text.class, false, partDesc.getTableDesc().getProperties(), null);
        recWriter.close(false);
        job.setBoolean("NeedPostfix", true);

        fss = listStatus(job);
    }

    Random r = new Random(123456);
    for (int i = 0; i < fss.length; i++) {
        int x = r.nextInt(fss.length);
        FileStatus tmp = fss[i];
        fss[i] = fss[x];
        fss[x] = tmp;
    }
    int[] fslengths = new int[fss.length];
    for (int i = 0; i < fss.length; i++) {
        IFormatDataFile ifdf = new IFormatDataFile(job);
        ifdf.open(fss[i].getPath().toString());
        fslengths[i] = ifdf.recnum();
        ifdf.close();
    }

    int id = 0;
    int offset = 0;
    int currlen = 0;
    ArrayList<FileSplit> currFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>();
    while (true) {
        int need = lenNum - currlen;
        int remain = fslengths[id] - offset;

        if (need <= remain) {
            currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, need));
            splits.add(new FormatStorageInputSplit_WithLineNum(
                    currFileSplits.toArray(new FileSplit[currFileSplits.size()]),
                    fss[id].getPath().getFileSystem(job).getFileBlockLocations(fss[id], 0, fss[id].getLen())[0]
                            .getHosts()));
            currFileSplits.clear();

            currlen = 0;

            offset += need;
        } else {
            if (remain != 0) {
                currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, remain));
            }
            id++;
            offset = 0;
            currlen += remain;
        }

        if (id == fss.length) {
            if (currFileSplits.size() != 0) {
                splits.add(new FormatStorageInputSplit_WithLineNum(
                        currFileSplits.toArray(new FileSplit[currFileSplits.size()]),
                        fss[id - 1].getPath().getFileSystem(job).getFileBlockLocations(fss[id - 1], 0,
                                fss[id - 1].getLen())[0].getHosts()));
            }
            break;
        }
    }

    if (splits.size() == 0) {
        ArrayList<FileSplit> emptyFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>();
        emptyFileSplits.add(new FileSplit(fss[0].getPath().toString(), 0, 0));

        splits.add(new FormatStorageInputSplit_WithLineNum(
                emptyFileSplits.toArray(new FileSplit[emptyFileSplits.size()]),
                fss[0].getPath().getFileSystem(job).getFileBlockLocations(fss[0], 0, fss[0].getLen())[0]
                        .getHosts()));
    }

    for (int i = 0; i < splits.size(); i++) {
        LOG.info(splits.get(i).toString());
    }

    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FormatStorageInputSplit_WithLineNum[splits.size()]);

}

From source file: uk.bl.wa.hadoop.datasets.WARCDatasetGenerator.java

License: Open Source License

/**
 *
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    // Decide whether to apply annotations:
    // Store the properties:
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));

    // Reducer count
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCDatasetMapper.class);
    conf.setReducerClass(FrequencyCountingReducer.class);
    // This can be optionally use to suppress keys:
    // conf.setOutputFormat(KeylessTextOutputFormat.class);
    // conf.set( "map.output.key.field.separator", "" );

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);

    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, TextOutputFormat.class, Text.class,
            Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOSTS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addMultiNamedOutput(conf, FACES_NAME, TextOutputFormat.class, Text.class, Text.class);

}

From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator.java

License: Open Source License

/**
 *
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        LOG.info("Loading config from: " + configPath);
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        LOG.info("Using default config: mdx");
        index_conf = ConfigFactory.load("mdx");
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config: " + index_conf.getString("warc.title"));

    // Reducer count:
    int numReducers = 10;
    if (index_conf.hasPath(WARC_HADOOP_NUM_REDUCERS)) {
        numReducers = index_conf.getInt(WARC_HADOOP_NUM_REDUCERS);
    }
    if (conf.getInt(WARC_HADOOP_NUM_REDUCERS, -1) != -1) {
        LOG.info("Overriding num_reducers using Hadoop config.");
        numReducers = conf.getInt(WARC_HADOOP_NUM_REDUCERS, numReducers);
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCMDXMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    // conf.setOutputFormat(TextOutputFormat.class);
    // SequenceFileOutputFormat.setOutputCompressionType(conf,
    // CompressionType.BLOCK);
    // OR TextOutputFormat?
    // conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.indexer.WARCIndexerRunner.java

License: Open Source License

/**
 *
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access
    // them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }
    // Decide whether to apply annotations:
    index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
            ConfigValueFactory.fromAnyRef(applyAnnotations));
    // Store the properties:
    conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));
    if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
        LOG.info("Using hash-based ID.");
    }
    if (index_conf.hasPath("warc.solr.zookeepers")) {
        LOG.info("Using Zookeepers.");
    } else {
        LOG.info("Using SolrServers.");
    }

    // Also set reduce speculative execution off, avoiding duplicate
    // submissions to Solr.
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count dependent on concurrent HTTP connections to Solr
    // server.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCIndexerMapper.class);
    conf.setReducerClass(WARCIndexerReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");
    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
    // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(numReducers);
}

From source file: uk.bl.wa.hadoop.mapreduce.mdx.MDXMerger.java

License: Open Source License

/**
 *
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    // Input
    conf.setInputFormat(TextInputFormat.class);
    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);
    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);
    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);
    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax
                                                            // for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");
    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

}