List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean value.
Parameters: name - the property name to set; value - the boolean value of the property.
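Before the source-file examples, a minimal self-contained sketch of the call pattern (the property name here is illustrative, not one used by the projects below): setBoolean stores the flag as an ordinary configuration entry, and getBoolean reads it back with a default.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanSketch {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store a boolean flag under an illustrative property name.
        conf.setBoolean("example.feature.enabled", true);
        // Read it back; the second argument is the default used when the property is unset.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}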
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test public void writeExcelOutputFormatExcel2003SingleSheetOneLinkedWorkbook() throws IOException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write/*from w w w . jav a2 s.co m*/ JobConf job = new JobConf(defaultConf); String linkedWB1FileName = "excel2003linkedwb1"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, linkedWB1FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(reporter); // write mainworkbook String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName + ".xls]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1", "Sheet1"); // should be test2 in the end // write job = new JobConf(defaultConf); String mainWBfileName = "excel2003singlesheetlinkedwbtestout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job, mainWBfileName, null); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.close(reporter); // try to read it again job = new JobConf(defaultConf); Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xls"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks job.setBoolean("hadoopoffice.read.linkedworkbooks", true); job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); inputFormat.configure(job); InputSplit[] inputSplits = inputFormat.getSplits(job, 1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter); 
assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1 Sheet1"); assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\""); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns for Sheet1"); assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test4\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); }
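The two setBoolean calls in this test ("hadoopoffice.read.linkedworkbooks" and "hadoopoffice.read.ignoremissinglinkedworkbooks") end up as ordinary Configuration entries, so they can also be read back with getBoolean. A minimal sketch, with the property names taken from the test above and the getBoolean defaults chosen purely for illustration:

JobConf job = new JobConf();
// enable resolution of linked workbooks and do not ignore missing ones, as in the test above
job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
// getBoolean returns the stored value, or the supplied default when the key is absent
boolean readLinked = job.getBoolean("hadoopoffice.read.linkedworkbooks", false);
boolean ignoreMissing = job.getBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", true);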
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test public void writeExcelOutputFormatExcel2003SingleSheetTwoLinkedWorkbooks() throws IOException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write//from www . j a va 2s . co m JobConf job = new JobConf(defaultConf); String linkedWB1FileName = "excel2003linkedwb1b"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, linkedWB1FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(reporter); // write linkedworkbook2 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1"); // write job = new JobConf(defaultConf); String linkedWB2FileName = "excel2003linkedwb2b"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); outputFormat = new ExcelFileOutputFormat(); writer = outputFormat.getRecordWriter(null, job, linkedWB2FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb2a1); writer.write(null, wb2b1); writer.write(null, wb2c1); writer.close(reporter); // write mainworkbook String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName + ".xls]:[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB2FileName + ".xls]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xls]Sheet1!B1", "B1", "Sheet1"); // should be test2 in the end SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "[" + linkedWB2FileName + ".xls]Sheet1!B1", "C1", "Sheet1"); // should be test5 in the end // write job = new JobConf(defaultConf); String mainWBfileName = "excel2003singlesheetlinkedwb2testout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); 
// set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.ms-excel"); job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job, mainWBfileName, null); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.write(null, c1); writerMain.close(reporter); // try to read it again job = new JobConf(defaultConf); Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xls"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks job.setBoolean("hadoopoffice.read.linkedworkbooks", true); job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); inputFormat.configure(job); InputSplit[] inputSplits = inputFormat.getSplits(job, 1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1 Sheet1"); assertEquals("[" + mainWBfileName + ".xls]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xls]Sheet1!A1\""); assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns for Sheet1"); assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test7\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test5\""); }
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184") @Test//w w w . j ava2 s.c o m public void writeExcelOutputFormatExcel2013SingleSheetOneLinkedWorkbook() throws IOException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write JobConf job = new JobConf(defaultConf); String linkedWB1FileName = "excel2013linkedwb1"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, linkedWB1FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(reporter); // write mainworkbook String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName + ".xlsx]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "[" + linkedWB1FileName + ".xlsx]Sheet1!B1", "B1", "Sheet1"); // should be test2 in the end // write job = new JobConf(defaultConf); String mainWBfileName = "excel2013singlesheetlinkedwbtestout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job, mainWBfileName, null); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.close(reporter); // try to read it again job = new JobConf(defaultConf); Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xlsx"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks 
job.setBoolean("hadoopoffice.read.linkedworkbooks", true); job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); inputFormat.configure(job); InputSplit[] inputSplits = inputFormat.getSplits(job, 1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1 Sheet1"); assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\""); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns for Sheet1"); assertEquals("test4", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test4\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); }
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Disabled("This does not work yet due to a bug in Apache POI that prevents writing correct workbooks containing external references: https://bz.apache.org/bugzilla/show_bug.cgi?id=57184") @Test//ww w .ja v a2 s. c om public void writeExcelOutputFormatExcel2013SingleSheetTwoLinkedWorkbooks() throws IOException { // write linkedworkbook1 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb1a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb1b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb1c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1"); // write JobConf job = new JobConf(defaultConf); String linkedWB1FileName = "excel2013linkedwb1"; String tmpDir = tmpPath.toString(); Path outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, linkedWB1FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb1a1); writer.write(null, wb1b1); writer.write(null, wb1c1); writer.close(reporter); // write linkedworkbook2 // one row string and three columns ("test1","test2","test3") // (String formattedValue, String comment, String formula, String address,String // sheetName) SpreadSheetCellDAO wb2a1 = new SpreadSheetCellDAO("test4", "", "", "A1", "Sheet1"); SpreadSheetCellDAO wb2b1 = new SpreadSheetCellDAO("test5", "", "", "B1", "Sheet1"); SpreadSheetCellDAO wb2c1 = new SpreadSheetCellDAO("test6", "", "", "C1", "Sheet1"); // write job = new JobConf(defaultConf); String linkedWB2FileName = "excel2013linkedwb2"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes outputFormat = new ExcelFileOutputFormat(); writer = outputFormat.getRecordWriter(null, job, linkedWB2FileName, null); assertNotNull(writer, "Format returned null RecordWriter"); writer.write(null, wb2a1); writer.write(null, wb2b1); writer.write(null, wb2c1); writer.close(reporter); // write mainworkbook String linkedWorkbookFilename = "[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB1FileName + ".xlsx]:[" + tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + linkedWB2FileName + ".xlsx]"; SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test7", "", "", "A1", "Sheet1"); SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB1FileName + 
".xlsx]Sheet1'!B1", "B1", "Sheet1"); // should be test2 in the end SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("", "", "'[" + linkedWB2FileName + ".xlsx]Sheet1'!B1", "B1", "Sheet1"); // should be test5 in the end // write job = new JobConf(defaultConf); String mainWBfileName = "excel2013singlesheetlinkedwbtestout"; outputPath = new Path(tmpDir); FileOutputFormat.setOutputPath(job, outputPath); // set generic outputformat settings job.set(JobContext.TASK_ATTEMPT_ID, attempt); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); job.set("hadoopoffice.write.mimeType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"); // new // Excel // format, // anyway // default, // but // here // for // illustrative // purposes job.set("hadoopoffice.write.linkedworkbooks", linkedWorkbookFilename); outputFormat = new ExcelFileOutputFormat(); RecordWriter<NullWritable, SpreadSheetCellDAO> writerMain = outputFormat.getRecordWriter(null, job, mainWBfileName, null); assertNotNull(writerMain, "Format returned null RecordWriter"); writerMain.write(null, a1); writerMain.write(null, b1); writerMain.write(null, c1); writerMain.close(reporter); // try to read it again job = new JobConf(defaultConf); Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator + "_temporary" + File.separator + attempt + File.separator + mainWBfileName + ".xlsx"); FileInputFormat.setInputPaths(job, inputFile); // set locale to the one of the test data job.set("hadoopoffice.read.locale.bcp47", "de"); // enable option to read linked workbooks job.setBoolean("hadoopoffice.read.linkedworkbooks", true); job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false); ExcelFileInputFormat inputFormat = new ExcelFileInputFormat(); inputFormat.configure(job); InputSplit[] inputSplits = inputFormat.getSplits(job, 1); assertEquals(1, inputSplits.length, "Only one split generated for Excel file"); RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter); assertNotNull(reader, "Format returned null RecordReader"); Text spreadSheetKey = new Text(); ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class); assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1 Sheet1"); assertEquals("[" + mainWBfileName + ".xlsx]Sheet1!A1", spreadSheetKey.toString(), "Input Split for Excel file has keyname == \"[" + mainWBfileName + ".xlsx]Sheet1!A1\""); assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns for Sheet1"); assertEquals("test7", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 1 == \"test7\""); // this comes from the external workbook assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 2 == \"test2\""); assertEquals("test5", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(), "Input Split for Excel file contains row 1 with cell 3 == \"test5\""); }
From source file:sa.edu.kaust.twitter.preprocess.spam.RemoveTweetsOfSpamUsers.java
License:Apache License
public static int removeTweetsOfSpamUsers(String inputPath, String outputPath, int numReducers,
        String spamUserListFile, long startID, long endID, String nTweetsFile, Boolean spam) throws Exception {
    sLogger.info("input: " + inputPath);
    sLogger.info("output dir: " + outputPath);
    sLogger.info("spam user list file: " + spamUserListFile);

    JobConf conf = new JobConf(RemoveTweetsOfSpamUsers.class);
    FileSystem fs = FileSystem.get(conf);
    conf.setJobName("RemoveSpamUserTweets");
    conf.setLong("startID", startID);
    conf.setLong("endID", endID);
    conf.setNumReduceTasks(numReducers);
    conf.setBoolean("spam", spam);

    // put the mapping file in the distributed cache so each map worker will have it
    //DistributedCache.addCacheFile(new URI(mappingFile), conf);
    if (conf.get("mapred.job.tracker").equals("local")) {
        conf.set("SpamUserListFile", spamUserListFile);
    } else {
        DistributedCache.addCacheFile(new URI(spamUserListFile), conf);
    }

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(LongWritable.class);
    conf.setOutputValueClass(TweetWritable.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // delete the output directory if it exists already
    //FileSystem.get(conf).delete(new Path(outputPath), true);
    if (fs.exists(new Path(outputPath))) {
        sLogger.info("Output already exists: skipping!");
        return FSProperty.readInt(fs, nTweetsFile);
    }

    RunningJob job = JobClient.runJob(conf);
    Counters counters = job.getCounters();
    int nonSpamTweets = (int) counters.findCounter(Statistics.NON_SPAM_TWEETS).getCounter();
    FSProperty.writeInt(fs, nTweetsFile, nonSpamTweets);
    sLogger.info("num of non-spam tweets: " + nonSpamTweets);
    return nonSpamTweets;
}
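The flag stored above with conf.setBoolean("spam", spam) is read back on the task side. The project's MyMapper is not shown here, so the following is only a hedged sketch of how a mapper might pick the flag up in configure(); the class name, field, and simplified value types are assumptions, not the project's actual code:

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Illustrative mapper, not the project's MyMapper: value types simplified to Text.
public class SpamFlagMapper extends MapReduceBase
        implements Mapper<LongWritable, Text, LongWritable, Text> {

    private boolean keepSpam;

    @Override
    public void configure(JobConf conf) {
        // reads the flag the driver stored via conf.setBoolean("spam", spam)
        keepSpam = conf.getBoolean("spam", false);
    }

    @Override
    public void map(LongWritable key, Text value,
            OutputCollector<LongWritable, Text> output, Reporter reporter) throws IOException {
        // emit or drop records depending on the boolean job setting
        if (keepSpam) {
            output.collect(key, value);
        }
    }
}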
From source file:StorageEngineClient.FormatStorageInputFormat_SplitByLineNum.java
License:Open Source License
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { List<FormatStorageInputSplit_WithLineNum> splits = new ArrayList<FormatStorageInputSplit_WithLineNum>(); int lenNum = job.getInt("hive.inputfiles.line_num_per_split", 1000000); if (lenNum < 10000) { LOG.info("lenNum been set to " + lenNum + " is too small, so set it to 1000000"); lenNum = 1000000;// w w w . j a va 2 s. com } FileStatus[] fss = listStatus(job); FileStatus[] orignalFss = fss; List<FileStatus> fssList = new ArrayList<FileStatus>(); for (int i = 0; i < fss.length; i++) { if (fss[i].getLen() > 0) { fssList.add(fss[i]); } } fss = (FileStatus[]) fssList.toArray(new FileStatus[0]); int listSize = fss.length; if (listSize == 0) { mapredWork mrWork = Utilities.getMapRedWork(job); Path inputPath = orignalFss[0].getPath(); Path inputParentPath = inputPath.getParent(); String inputPathStr = inputPath.toUri().toString(); String inputPathParentStr = inputParentPath.toString(); FileSystem fs = inputPath.getFileSystem(job); fs.delete(inputPath, true); LinkedHashMap<String, partitionDesc> partDescMap = mrWork.getPathToPartitionInfo(); partitionDesc partDesc = partDescMap.get(inputPathParentStr); job.setBoolean("NeedPostfix", false); RecordWriter recWriter = new FormatStorageHiveOutputFormat().getHiveRecordWriter(job, inputPath, Text.class, false, partDesc.getTableDesc().getProperties(), null); recWriter.close(false); job.setBoolean("NeedPostfix", true); fss = listStatus(job); } Random r = new Random(123456); for (int i = 0; i < fss.length; i++) { int x = r.nextInt(fss.length); FileStatus tmp = fss[i]; fss[i] = fss[x]; fss[x] = tmp; } int[] fslengths = new int[fss.length]; for (int i = 0; i < fss.length; i++) { IFormatDataFile ifdf = new IFormatDataFile(job); ifdf.open(fss[i].getPath().toString()); fslengths[i] = ifdf.recnum(); ifdf.close(); } int id = 0; int offset = 0; int currlen = 0; ArrayList<FileSplit> currFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>(); while (true) { int need = lenNum - currlen; int remain = fslengths[id] - offset; if (need <= remain) { currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, need)); splits.add(new FormatStorageInputSplit_WithLineNum( currFileSplits.toArray(new FileSplit[currFileSplits.size()]), fss[id].getPath().getFileSystem(job).getFileBlockLocations(fss[id], 0, fss[id].getLen())[0] .getHosts())); currFileSplits.clear(); currlen = 0; offset += need; } else { if (remain != 0) { currFileSplits.add(new FileSplit(fss[id].getPath().toString(), offset, remain)); } id++; offset = 0; currlen += remain; } if (id == fss.length) { if (currFileSplits.size() != 0) { splits.add(new FormatStorageInputSplit_WithLineNum( currFileSplits.toArray(new FileSplit[currFileSplits.size()]), fss[id - 1].getPath().getFileSystem(job).getFileBlockLocations(fss[id - 1], 0, fss[id - 1].getLen())[0].getHosts())); } break; } } if (splits.size() == 0) { ArrayList<FileSplit> emptyFileSplits = new ArrayList<FormatStorageInputFormat_SplitByLineNum.FileSplit>(); emptyFileSplits.add(new FileSplit(fss[0].getPath().toString(), 0, 0)); splits.add(new FormatStorageInputSplit_WithLineNum( emptyFileSplits.toArray(new FileSplit[emptyFileSplits.size()]), fss[0].getPath().getFileSystem(job).getFileBlockLocations(fss[0], 0, fss[0].getLen())[0] .getHosts())); } for (int i = 0; i < splits.size(); i++) { LOG.info(splits.get(i).toString()); } LOG.info("Total # of splits: " + splits.size()); return splits.toArray(new FormatStorageInputSplit_WithLineNum[splits.size()]); }
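The two setBoolean calls on "NeedPostfix" above temporarily disable the postfix behaviour while an empty placeholder file is written, then switch it back on. A hedged sketch of that toggle-around-a-call pattern follows; writePlaceholder is a hypothetical helper standing in for the getHiveRecordWriter call, and saving and restoring the previous value is a small generalisation of the original, which simply sets the flag back to true:

JobConf job = new JobConf();
boolean previous = job.getBoolean("NeedPostfix", true);
job.setBoolean("NeedPostfix", false);        // temporarily disable the postfix
try {
    writePlaceholder(job);                   // hypothetical helper; the original uses FormatStorageHiveOutputFormat
} finally {
    job.setBoolean("NeedPostfix", previous); // restore the earlier setting
}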
From source file:uk.bl.wa.hadoop.datasets.WARCDatasetGenerator.java
License:Open Source License
/** * /*from ww w . j a v a2 s . com*/ * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Store application properties where the mappers/reducers can access // them Config index_conf; if (this.configPath != null) { index_conf = ConfigFactory.parseFile(new File(this.configPath)); } else { index_conf = ConfigFactory.load(); } if (this.dumpConfig) { ConfigPrinter.print(index_conf); System.exit(0); } // Decide whether to apply annotations: // Store the properties: conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise())); LOG.info("Loaded warc config."); LOG.info(index_conf.getString("warc.title")); // Reducer count int numReducers = 1; try { numReducers = index_conf.getInt("warc.hadoop.num_reducers"); } catch (NumberFormatException n) { numReducers = 10; } // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(WARCDatasetMapper.class); conf.setReducerClass(FrequencyCountingReducer.class); // This can be optionally use to suppress keys: // conf.setOutputFormat(KeylessTextOutputFormat.class); // conf.set( "map.output.key.field.separator", "" ); // Compress the output from the maps, to cut down temp space // requirements between map and reduce. conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax // for 0.20.x ? conf.set("mapred.compress.map.output", "true"); // conf.set("mapred.map.output.compression.codec", // "org.apache.hadoop.io.compress.GzipCodec"); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.task.classpath.user.precedence", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FORMATS_FFB_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, HOSTS_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, HOST_LINKS_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, GEO_SUMMARY_NAME, TextOutputFormat.class, Text.class, Text.class); MultipleOutputs.addMultiNamedOutput(conf, FACES_NAME, TextOutputFormat.class, Text.class, Text.class); }
From source file:uk.bl.wa.hadoop.indexer.mdx.WARCMDXGenerator.java
License:Open Source License
/** * /*from w ww . j a v a 2 s. c om*/ * @param args * @return * @throws IOException * @throws ParseException * @throws InterruptedException * @throws KeeperException */ protected void createJobConf(JobConf conf, String[] args) throws IOException, ParseException, KeeperException, InterruptedException { // Parse the command-line parameters. this.setup(args, conf); // Store application properties where the mappers/reducers can access // them Config index_conf; if (this.configPath != null) { LOG.info("Loading config from: " + configPath); index_conf = ConfigFactory.parseFile(new File(this.configPath)); } else { LOG.info("Using default config: mdx"); index_conf = ConfigFactory.load("mdx"); } if (this.dumpConfig) { ConfigPrinter.print(index_conf); System.exit(0); } conf.set(CONFIG_PROPERTIES, index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise())); LOG.info("Loaded warc config: " + index_conf.getString("warc.title")); // Reducer count: int numReducers = 10; if (index_conf.hasPath(WARC_HADOOP_NUM_REDUCERS)) { numReducers = index_conf.getInt(WARC_HADOOP_NUM_REDUCERS); } if (conf.getInt(WARC_HADOOP_NUM_REDUCERS, -1) != -1) { LOG.info("Overriding num_reducers using Hadoop config."); numReducers = conf.getInt(WARC_HADOOP_NUM_REDUCERS, numReducers); } // Add input paths: LOG.info("Reading input files..."); String line = null; BufferedReader br = new BufferedReader(new FileReader(this.inputPath)); while ((line = br.readLine()) != null) { FileInputFormat.addInputPath(conf, new Path(line)); } br.close(); LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files."); FileOutputFormat.setOutputPath(conf, new Path(this.outputPath)); conf.setJobName(this.inputPath + "_" + System.currentTimeMillis()); conf.setInputFormat(ArchiveFileInputFormat.class); conf.setMapperClass(WARCMDXMapper.class); conf.setReducerClass(MDXReduplicatingReducer.class); conf.setOutputFormat(SequenceFileOutputFormat.class); // conf.setOutputFormat(TextOutputFormat.class); // SequenceFileOutputFormat.setOutputCompressionType(conf, // CompressionType.BLOCK); // OR TextOutputFormat? // conf.set("map.output.key.field.separator", ""); // Compress the output from the maps, to cut down temp space // requirements between map and reduce. conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax // for 0.20.x ? conf.set("mapred.compress.map.output", "true"); // conf.set("mapred.map.output.compression.codec", // "org.apache.hadoop.io.compress.GzipCodec"); // Ensure the JARs we provide take precedence over ones from Hadoop: conf.setBoolean("mapreduce.task.classpath.user.precedence", true); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(Text.class); conf.setNumReduceTasks(numReducers); }
From source file:uk.bl.wa.hadoop.indexer.WARCIndexerRunner.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
protected void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Store application properties where the mappers/reducers can access them
    Config index_conf;
    if (this.configPath != null) {
        index_conf = ConfigFactory.parseFile(new File(this.configPath));
    } else {
        index_conf = ConfigFactory.load();
    }
    if (this.dumpConfig) {
        ConfigPrinter.print(index_conf);
        System.exit(0);
    }

    // Decide whether to apply annotations:
    index_conf = index_conf.withValue(CONFIG_APPLY_ANNOTATIONS,
            ConfigValueFactory.fromAnyRef(applyAnnotations));

    // Store the properties:
    conf.set(CONFIG_PROPERTIES,
            index_conf.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));
    LOG.info("Loaded warc config.");
    LOG.info(index_conf.getString("warc.title"));
    if (index_conf.getBoolean("warc.solr.use_hash_url_id")) {
        LOG.info("Using hash-based ID.");
    }
    if (index_conf.hasPath("warc.solr.zookeepers")) {
        LOG.info("Using Zookeepers.");
    } else {
        LOG.info("Using SolrServers.");
    }

    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.set("mapred.reduce.tasks.speculative.execution", "false");

    // Reducer count dependent on concurrent HTTP connections to Solr server.
    int numReducers = 1;
    try {
        numReducers = index_conf.getInt("warc.hadoop.num_reducers");
    } catch (NumberFormatException n) {
        numReducers = 10;
    }

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(WARCIndexerMapper.class);
    conf.setReducerClass(WARCIndexerReducer.class);
    conf.setOutputFormat(KeylessTextOutputFormat.class);
    conf.set("map.output.key.field.separator", "");

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);

    conf.setBoolean("mapred.output.oai-pmh", this.exportXml);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(numReducers);
}
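Several of these drivers set map-output compression twice: once with setBoolean under the current property name and once with set under the legacy name (the "Wrong syntax for 0.20.x ?" comment refers to this). A minimal sketch of that dual-key pattern; whether both keys are still needed depends on the Hadoop version in use:

JobConf conf = new JobConf();
conf.setBoolean("mapreduce.map.output.compress", true); // current property name
conf.set("mapred.compress.map.output", "true");         // legacy property name kept for older releases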
From source file:uk.bl.wa.hadoop.mapreduce.mdx.MDXMerger.java
License:Open Source License
/**
 * @param args
 * @return
 * @throws IOException
 * @throws ParseException
 * @throws InterruptedException
 * @throws KeeperException
 */
public void createJobConf(JobConf conf, String[] args)
        throws IOException, ParseException, KeeperException, InterruptedException {
    // Parse the command-line parameters.
    this.setup(args, conf);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(this.inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");
    FileOutputFormat.setOutputPath(conf, new Path(this.outputPath));

    conf.setJobName(this.inputPath + "_" + System.currentTimeMillis());

    // Input
    conf.setInputFormat(TextInputFormat.class);

    // M-R
    conf.setMapperClass(IdentityMapper.class);
    conf.setReducerClass(MDXReduplicatingReducer.class);

    // Map outputs
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(Text.class);

    // Job outputs
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(TextOutputFormat.class);

    LOG.info("Used " + numReducers + " reducers.");
    conf.setNumReduceTasks(numReducers);

    // Compress the output from the maps, to cut down temp space
    // requirements between map and reduce.
    conf.setBoolean("mapreduce.map.output.compress", true); // Wrong syntax for 0.20.x ?
    conf.set("mapred.compress.map.output", "true");
    // conf.set("mapred.map.output.compression.codec",
    // "org.apache.hadoop.io.compress.GzipCodec");

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.task.classpath.user.precedence", true);
}