List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
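The examples below are taken from real projects. As a baseline, here is a minimal, self-contained sketch (the property name and class name are chosen purely for illustration) showing how setBoolean stores a flag in a JobConf and how getBoolean reads it back with a default:

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store a boolean flag under an illustrative property name.
        conf.setBoolean("example.feature.enabled", true);
        // getBoolean returns the stored value, or the supplied default if the property is unset.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}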
From source file: org.terrier.applications.HadoopIndexing.java
License:Mozilla Public License
/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();
    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJobName("terrierIndexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return;
    }

    boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));
    conf.setNumReduceTasks(numberOfReducers);

    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }
    if (ranOK) {
        if (!docPartitioned) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }
        Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                docPartitioned ? numberOfReducers : 1, jf);
    }
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
}
From source file:org.terrier.utility.io.HadoopUtility.java
License:Mozilla Public License
protected static void saveClassPathToJob(JobConf jobConf) throws IOException {
    logger.info("Copying classpath to job");
    if (jobConf.getBoolean("terrier.classpath.copied", false)) {
        return;
    }
    jobConf.setBoolean("terrier.classpath.copied", true);
    final String[] jars = findJarFiles(
            new String[] { System.getenv().get("CLASSPATH"), System.getProperty("java.class.path") });
    final FileSystem defFS = FileSystem.get(jobConf);
    for (String jarFile : jars) {
        //logger.debug("Adding " + jarFile + " to job class path");
        Path srcJarFilePath = new Path("file:///" + jarFile);
        String filename = srcJarFilePath.getName();
        Path tmpJarFilePath = makeTemporaryFile(jobConf, filename);
        defFS.copyFromLocalFile(srcJarFilePath, tmpJarFilePath);
        DistributedCache.addFileToClassPath(tmpJarFilePath, jobConf);
    }
    DistributedCache.createSymlink(jobConf);
}
From source file:org.warcbase.index.IndexerReducer.java
License:Apache License
@Override
public void configure(JobConf job) {
    LOG.info("Configuring reducer...");
    // Initialize the embedded server.
    try {
        job.setBoolean("fs.hdfs.impl.disable.cache", true);
        fs = FileSystem.get(job);
        solrHome = Solate.findSolrConfig(job, IndexerRunner.solrHomeZipName);
        LOG.info("Found solrHomeDir " + solrHome);
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error("FAILED in reducer configuration: " + e);
    }
    outputDir = new Path(job.get(HDFS_OUTPUT_PATH));
    LOG.info("HDFS index output path: " + outputDir);
    LOG.info("Initialization complete.");
}
From source file:org.warcbase.index.IndexerRunner.java
License:Apache License
@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);
    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }
    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License:Apache License
@Test
public void testFormatWithNoPreviousRevision() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "auto.txt");
    fs.delete(dir, true);
    StreamWikiDumpInputFormat.setInputPaths(job, dir);
    Writer txtWriter = new OutputStreamWriter(fs.create(txtFile));
    try {
        txtWriter.write(
                "<tree><page><header/><revision>first</revision><revision>second</revision><revision>third</revision><revision>n</revision><revision>n+1</revision></page>\n"
                        + "<page><longlongheader/><revision>e</revision></page>\n"
                        + "<page><long-long-long-header/><revision>f</revision></page></tree>\n");
    } finally {
        txtWriter.flush();
        txtWriter.close();
    }
    StreamWikiDumpInputFormat format = new StreamWikiDumpInputFormat();
    job.setBoolean("org.wikimedia.wikihadoop.previousRevision", false);
    format.configure(job);
    List<String> found = collect(format, job, 1);
    assertEquals(Arrays.asList(new String[] { "<page><header/><revision>first</revision>\n</page>\n",
            "<page><header/><revision>second</revision>\n</page>\n",
            "<page><header/><revision>third</revision>\n</page>\n",
            "<page><header/><revision>n</revision>\n</page>\n",
            "<page><header/><revision>n+1</revision>\n</page>\n",
            "<page><longlongheader/><revision>e</revision>\n</page>\n",
            "<page><long-long-long-header/><revision>f</revision>\n</page>\n", }), found);
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2013LinkedWorkbook() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2013linkedworkbooks.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2013linkedworkbooks.xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2013linkedworkbooks.xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2013LinkedWorkbookAlternativeLocation() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    // file
    String fileName = "excel2013linkedworkbooks.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    // alternativeLocation
    String resourcePath = new File(classLoader.getResource(fileName).getFile()).getParent();
    String alternativeLocation = resourcePath + File.separator + "alternatelocationlinkedwb";
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    job.set("hadoopoffice.read.linkedworkbooks.location", alternativeLocation);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2013linkedworkbooks.xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2013linkedworkbooks.xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2003LinkedWorkbookAlternativeLocation() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2003linkedworkbooks.xls";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    // alternativeLocation
    String resourcePath = new File(classLoader.getResource(fileName).getFile()).getParent();
    String alternativeLocation = resourcePath + File.separator + "alternatelocationlinkedwb";
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    job.set("hadoopoffice.read.linkedworkbooks.location", alternativeLocation);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2003linkedworkbooks.xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2003linkedworkbooks.xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2003LinkedWorkbook() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2003linkedworkbooks.xls";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2003linkedworkbooks.xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2003linkedworkbooks.xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void writeExcelOutputFormatExcel2013SingleSheetGZipCompressed() throws IOException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address, String sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing todo
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String fileName = "excel2013singlesheetcompressedtestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    job.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format, anyway default, but here for illustrative purposes
    job.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, fileName,
            null);
    assertNotNull(writer, "Format returned null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + fileName + ".xlsx.gz");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[" + fileName + ".xlsx.gz]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx.gz]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 3");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 4");
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
}