List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
Sets the value of the name property to a boolean.
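The examples below are taken from real projects. As a baseline, here is a minimal, self-contained sketch (the property name and class name are chosen purely for illustration) showing how setBoolean stores a flag in a JobConf and how getBoolean reads it back with a default:

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        // Store a boolean flag under an illustrative property name.
        conf.setBoolean("example.feature.enabled", true);
        // getBoolean returns the stored value, or the supplied default if the property is unset.
        boolean enabled = conf.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}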
From source file: org.terrier.applications.HadoopIndexing.java
License:Mozilla Public License
/** Starts the MapReduce indexing.
 * @param args
 * @throws Exception
 */
public static void main(String[] args) throws Exception {
    long time = System.currentTimeMillis();
    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.info("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return;
    } else if (args.length == 0) {
        logger.info("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    } else {
        logger.fatal(usage());
        return;
    }

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJobName("terrierIndexing");

    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return;
    }

    boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));
    conf.setNumReduceTasks(numberOfReducers);

    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        // for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    JobID jobId = null;
    boolean ranOK = true;
    try {
        RunningJob rj = JobClient.runJob(conf);
        jobId = rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        logger.error("Problem running job", e);
        ranOK = false;
    }
    if (jobId != null) {
        deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }
    if (ranOK) {
        if (!docPartitioned) {
            if (numberOfReducers > 1)
                mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }
        Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
                docPartitioned ? numberOfReducers : 1, jf);
    }
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
}
From source file:org.terrier.utility.io.HadoopUtility.java
License:Mozilla Public License
protected static void saveClassPathToJob(JobConf jobConf) throws IOException {
    logger.info("Copying classpath to job");
    if (jobConf.getBoolean("terrier.classpath.copied", false)) {
        return;
    }
    jobConf.setBoolean("terrier.classpath.copied", true);
    final String[] jars = findJarFiles(
            new String[] { System.getenv().get("CLASSPATH"), System.getProperty("java.class.path") });
    final FileSystem defFS = FileSystem.get(jobConf);
    for (String jarFile : jars) {
        //logger.debug("Adding " + jarFile + " to job class path");
        Path srcJarFilePath = new Path("file:///" + jarFile);
        String filename = srcJarFilePath.getName();
        Path tmpJarFilePath = makeTemporaryFile(jobConf, filename);
        defFS.copyFromLocalFile(srcJarFilePath, tmpJarFilePath);
        DistributedCache.addFileToClassPath(tmpJarFilePath, jobConf);
    }
    DistributedCache.createSymlink(jobConf);
}
From source file:org.warcbase.index.IndexerReducer.java
License:Apache License
@Override
public void configure(JobConf job) {
    LOG.info("Configuring reducer...");
    // Initialize the embedded server.
    try {
        job.setBoolean("fs.hdfs.impl.disable.cache", true);
        fs = FileSystem.get(job);
        solrHome = Solate.findSolrConfig(job, IndexerRunner.solrHomeZipName);
        LOG.info("Found solrHomeDir " + solrHome);
    } catch (IOException e) {
        e.printStackTrace();
        LOG.error("FAILED in reducer configuration: " + e);
    }
    outputDir = new Path(job.get(HDFS_OUTPUT_PATH));
    LOG.info("HDFS index output path: " + outputDir);
    LOG.info("Initialization complete.");
}
From source file:org.warcbase.index.IndexerRunner.java
License:Apache License
@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }
    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);
    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }
    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);
    return 0;
}
From source file:org.wikimedia.wikihadoop.TestStreamWikiDumpInputFormat.java
License:Apache License
@Test
public void testFormatWithNoPreviousRevision() throws IOException {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path txtFile = new Path(dir, "auto.txt");
    fs.delete(dir, true);
    StreamWikiDumpInputFormat.setInputPaths(job, dir);
    Writer txtWriter = new OutputStreamWriter(fs.create(txtFile));
    try {
        txtWriter.write(
                "<tree><page><header/><revision>first</revision><revision>second</revision><revision>third</revision><revision>n</revision><revision>n+1</revision></page>\n"
                        + "<page><longlongheader/><revision>e</revision></page>\n"
                        + "<page><long-long-long-header/><revision>f</revision></page></tree>\n");
    } finally {
        txtWriter.flush();
        txtWriter.close();
    }
    StreamWikiDumpInputFormat format = new StreamWikiDumpInputFormat();
    job.setBoolean("org.wikimedia.wikihadoop.previousRevision", false);
    format.configure(job);
    List<String> found = collect(format, job, 1);
    assertEquals(Arrays.asList(new String[] { "<page><header/><revision>first</revision>\n</page>\n",
            "<page><header/><revision>second</revision>\n</page>\n",
            "<page><header/><revision>third</revision>\n</page>\n",
            "<page><header/><revision>n</revision>\n</page>\n",
            "<page><header/><revision>n+1</revision>\n</page>\n",
            "<page><longlongheader/><revision>e</revision>\n</page>\n",
            "<page><long-long-long-header/><revision>f</revision>\n</page>\n", }), found);
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2013LinkedWorkbook() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2013linkedworkbooks.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2013linkedworkbooks.xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2013linkedworkbooks.xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2013LinkedWorkbookAlternativeLocation() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    // file
    String fileName = "excel2013linkedworkbooks.xlsx";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    // alternativeLocation
    String resourcePath = new File(classLoader.getResource(fileName).getFile()).getParent();
    String alternativeLocation = resourcePath + File.separator + "alternatelocationlinkedwb";
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    job.set("hadoopoffice.read.linkedworkbooks.location", alternativeLocation);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2013linkedworkbooks.xlsx]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2013linkedworkbooks.xlsx]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2003LinkedWorkbookAlternativeLocation() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2003linkedworkbooks.xls";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    // alternativeLocation
    String resourcePath = new File(classLoader.getResource(fileName).getFile()).getParent();
    String alternativeLocation = resourcePath + File.separator + "alternatelocationlinkedwb";
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    job.set("hadoopoffice.read.linkedworkbooks.location", alternativeLocation);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2003linkedworkbooks.xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2003linkedworkbooks.xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void readExcelInputFormatExcel2003LinkedWorkbook() throws IOException {
    JobConf job = new JobConf(defaultConf);
    ClassLoader classLoader = getClass().getClassLoader();
    String fileName = "excel2003linkedworkbooks.xls";
    String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
    Path file = new Path(fileNameSpreadSheet);
    FileInputFormat.setInputPaths(job, file);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // enable option to read linked workbooks
    job.setBoolean("hadoopoffice.read.linkedworkbooks", true);
    job.setBoolean("hadoopoffice.read.ignoremissinglinkedworkbooks", false);
    ExcelFileInputFormat format = new ExcelFileInputFormat();
    format.configure(job);
    InputSplit[] inputSplits = format.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[excel2003linkedworkbooks.xls]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[excel2003linkedworkbooks.xls]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("Sheet1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getSheetName(),
            "Input Split for Excel file contains row 1 with cell 1 sheetname == \"Sheet1\"");
    assertEquals("A1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getAddress(),
            "Input Split for Excel file contains row 1 with cell 1 address == \"A1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(2, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 2 columns");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"3\" (this tests also if the cached value of 6 is ignored)");
    assertEquals("5", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"5\"");
}
From source file:org.zuinnote.hadoop.office.format.mapred.OfficeFormatHadoopExcelNormalTest.java
License:Apache License
@Test
public void writeExcelOutputFormatExcel2013SingleSheetGZipCompressed() throws IOException {
    // one row string and three columns ("test1","test2","test3")
    // (String formattedValue, String comment, String formula, String address, String sheetName)
    SpreadSheetCellDAO a1 = new SpreadSheetCellDAO("test1", "", "", "A1", "Sheet1");
    SpreadSheetCellDAO b1 = new SpreadSheetCellDAO("test2", "", "", "B1", "Sheet1");
    SpreadSheetCellDAO c1 = new SpreadSheetCellDAO("test3", "", "", "C1", "Sheet1");
    // empty row => nothing todo
    // one row numbers (1,2,3)
    SpreadSheetCellDAO a3 = new SpreadSheetCellDAO("", "", "1", "A3", "Sheet1");
    SpreadSheetCellDAO b3 = new SpreadSheetCellDAO("", "", "2", "B3", "Sheet1");
    SpreadSheetCellDAO c3 = new SpreadSheetCellDAO("", "", "3", "C3", "Sheet1");
    // one row formulas (=A3+B3)
    SpreadSheetCellDAO a4 = new SpreadSheetCellDAO("", "", "A3+B3", "A4", "Sheet1");
    // write
    JobConf job = new JobConf(defaultConf);
    String fileName = "excel2013singlesheetcompressedtestout";
    String tmpDir = tmpPath.toString();
    Path outputPath = new Path(tmpDir);
    FileOutputFormat.setOutputPath(job, outputPath);
    // set generic outputformat settings
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    job.set("mapreduce.output.fileoutputformat.compress.codec", "org.apache.hadoop.io.compress.GzipCodec");
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    // new Excel format, anyway default, but here for illustrative purposes
    job.set("hadoopoffice.write.mimeType",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
    ExcelFileOutputFormat outputFormat = new ExcelFileOutputFormat();
    RecordWriter<NullWritable, SpreadSheetCellDAO> writer = outputFormat.getRecordWriter(null, job, fileName,
            null);
    assertNotNull(writer, "Format returned null RecordWriter");
    writer.write(null, a1);
    writer.write(null, b1);
    writer.write(null, c1);
    writer.write(null, a3);
    writer.write(null, b3);
    writer.write(null, c3);
    writer.write(null, a4);
    writer.close(reporter);
    // try to read it again
    job = new JobConf(defaultConf);
    Path inputFile = new Path(tmpDir + File.separator + "_temporary" + File.separator + "0" + File.separator
            + "_temporary" + File.separator + attempt + File.separator + fileName + ".xlsx.gz");
    FileInputFormat.setInputPaths(job, inputFile);
    // set locale to the one of the test data
    job.set("hadoopoffice.read.locale.bcp47", "de");
    ExcelFileInputFormat inputFormat = new ExcelFileInputFormat();
    inputFormat.configure(job);
    InputSplit[] inputSplits = inputFormat.getSplits(job, 1);
    assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
    RecordReader<Text, ArrayWritable> reader = inputFormat.getRecordReader(inputSplits[0], job, reporter);
    assertNotNull(reader, "Format returned null RecordReader");
    Text spreadSheetKey = new Text();
    ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
    assertEquals("[" + fileName + ".xlsx.gz]Sheet1!A1", spreadSheetKey.toString(),
            "Input Split for Excel file has keyname == \"[" + fileName + ".xlsx.gz]Sheet1!A1\"");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 with 3 columns");
    assertEquals("test1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 1 == \"test1\"");
    assertEquals("test2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 2 == \"test2\"");
    assertEquals("test3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 1 with cell 3 == \"test3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 2");
    assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contain row 2 and is empty");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 3");
    assertEquals(3, spreadSheetValue.get().length, "Input Split for Excel file contain row 3 with 3 columns");
    assertEquals("1", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"1\"");
    assertEquals("2", ((SpreadSheetCellDAO) spreadSheetValue.get()[1]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 2 == \"2\"");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[2]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 3 == \"3\"");
    assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 4");
    assertEquals(1, spreadSheetValue.get().length, "Input Split for Excel file contain row 4 with 1 column");
    assertEquals("3", ((SpreadSheetCellDAO) spreadSheetValue.get()[0]).getFormattedValue(),
            "Input Split for Excel file contains row 3 with cell 1 == \"3\"");
}