Example usage for org.apache.hadoop.mapred JobConf setBoolean

List of usage examples for org.apache.hadoop.mapred JobConf setBoolean

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf.setBoolean.

Prototype

public void setBoolean(String name, boolean value) 

Source Link

Document

Set the value of the `name` property to a `boolean`.

Usage

From source file:com.yolodata.tbana.cascading.csv.CSVLine.java

License:Open Source License

/**
 * Initialises the source-side job configuration for CSV input.
 *
 * <p>Rejects the job when {@code hasZippedFiles} reports zipped input paths;
 * otherwise installs the default delimiter and separator, marks the input as
 * not zipped, sets 40000 lines per map and selects
 * {@code CSVNLineInputFormat} as the input format.
 *
 * @param flowProcess the current flow process (not used here)
 * @param tap the tap being sourced (not used here)
 * @param conf the job configuration to populate
 * @throws IllegalStateException if the configured input paths contain zip files
 */
@Override
public void sourceConfInit(FlowProcess<JobConf> flowProcess, Tap<JobConf, RecordReader, OutputCollector> tap,
        JobConf conf) {
    // Fail fast: zipped inputs are not supported by this scheme.
    if (hasZippedFiles(FileInputFormat.getInputPaths(conf))) {
        final String configuredInputs = Arrays.toString(FileInputFormat.getInputPaths(conf));
        throw new IllegalStateException("cannot read zip files: " + configuredInputs);
    }

    // Input format and split sizing.
    conf.setInputFormat(CSVNLineInputFormat.class);
    conf.setInt(CSVNLineInputFormat.LINES_PER_MAP, 40000);

    // Record reader settings.
    conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false);
    conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR);
    conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER);
}

From source file:de.l3s.streamcorpus.mapreduce.TerrierIndexing.java

License:Mozilla Public License

/**
 * Configures the Terrier MapReduce indexing job and finishes the index.
 *
 * <p>Argument handling, as implemented below:
 * <ul>
 * <li>{@code -p <n>}: document-partitioned mode with {@code n} reducers;</li>
 * <li>{@code --merge}: only merge the existing reducer output, then return;</li>
 * <li>no arguments: term-partitioned mode.</li>
 * </ul>
 * Any other argument combination falls through with the defaults
 * (term-partitioned, reducer count taken from the application properties).
 *
 * @param args command-line arguments, see above
 * @return 0 on every path, including the early exits that only log an error
 * @throws Exception if no JobFactory can be obtained, or propagated from the
 *         steps called here
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        // Parse the requested reducer count first so the log line reports the
        // value that is actually used, not the configured default.
        numberOfReducers = Integer.parseInt(args[1]);
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1) {
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        } else {
            logger.error("No point merging 1 reduce task output");
        }
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null) {
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    }
    final JobConf conf = jf.newJob();
    conf.setJarByClass(TerrierIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    // Refuse to overwrite an index that already exists at the target location.
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    // Compress map output unless the job tracker is "local".
    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);

    // Parse the collection.spec: one input path per line, '#' starts a comment line.
    List<Path> paths = new ArrayList<Path>();
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    try {
        String line;
        while ((line = specBR.readLine()) != null) {
            // Skip comments and blank lines; a blank line cannot be turned into a Path.
            if (line.startsWith("#") || line.trim().length() == 0) {
                continue;
            }
            paths.add(new Path(line));
        }
    } finally {
        // Release the reader even when reading the spec fails.
        specBR.close();
    }
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage (both the current and the older property names are set)
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    // Reduce-side counterpart of mapred.map.child.java.opts.
    conf.set("mapred.reduce.child.java.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 12 GB (12288 MB)
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned) {
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        } else {
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
        }
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    // NOTE(review): job submission is commented out below, so conf is configured
    // but never run from this method - confirm this is intentional.
    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    // Term-partitioned output from several reducers is merged into one index.
    if (!docPartitioned) {
        if (numberOfReducers > 1) {
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        }
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}

From source file:de.l3s.streamcorpus.StreamCorpusIndexing.java

License:Mozilla Public License

/** Starts the MapReduce indexing.
 * @param args/*  w  w  w.j  av  a2  s  .com*/
 * @throws Exception
 */
public int run(String[] args) throws Exception {
    long time = System.currentTimeMillis();

    // For the moment: Hard-code the terrier home to quick test
    System.setProperty("terrier.home", "/home/tuan.tran/executable/StreamCorpusIndexer");

    boolean docPartitioned = false;
    int numberOfReducers = Integer
            .parseInt(ApplicationSetup.getProperty("terrier.hadoop.indexing.reducers", "26"));
    final HadoopPlugin.JobFactory jf = HadoopPlugin.getJobFactory("HOD-TerrierIndexing");
    if (args.length == 2 && args[0].equals("-p")) {
        logger.debug("Document-partitioned Mode, " + numberOfReducers + " output indices.");
        numberOfReducers = Integer.parseInt(args[1]);
        docPartitioned = true;
    } else if (args.length == 1 && args[0].equals("--merge")) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
        else
            logger.error("No point merging 1 reduce task output");
        return 0;
    } else if (args.length == 0) {
        logger.debug("Term-partitioned Mode, " + numberOfReducers + " reducers creating one inverted index.");
        docPartitioned = false;
        if (numberOfReducers > MAX_REDUCE) {
            logger.warn("Excessive reduce tasks (" + numberOfReducers + ") in use "
                    + "- SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm can use " + MAX_REDUCE + " at most");
        }
    }

    /*else
    {
       logger.fatal(usage());
       return 0;
    }*/

    if (!(CompressionFactory.getCompressionConfiguration("inverted", new String[0],
            false) instanceof BitCompressionConfiguration)) {
        logger.error("Sorry, only default BitCompressionConfiguration is supported by HadoopIndexing"
                + " - you can recompress the inverted index later using IndexRecompressor");
        return 0;
    }

    if (jf == null)
        throw new Exception("Could not get JobFactory from HadoopPlugin");
    final JobConf conf = jf.newJob();
    conf.setJarByClass(StreamCorpusIndexing.class);
    conf.setJobName("StreamCorpusIndexer: Terrier Indexing");
    if (Files.exists(ApplicationSetup.TERRIER_INDEX_PATH)
            && Index.existsIndex(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX)) {
        logger.fatal("Cannot index while index exists at " + ApplicationSetup.TERRIER_INDEX_PATH + ","
                + ApplicationSetup.TERRIER_INDEX_PREFIX);
        return 0;
    }

    // boolean blockIndexing = ApplicationSetup.BLOCK_INDEXING;
    boolean blockIndexing = true;
    if (blockIndexing) {
        conf.setMapperClass(Hadoop_BlockSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BlockSinglePassIndexer.class);
    } else {
        conf.setMapperClass(Hadoop_BasicSinglePassIndexer.class);
        conf.setReducerClass(Hadoop_BasicSinglePassIndexer.class);
    }
    FileOutputFormat.setOutputPath(conf, new Path(ApplicationSetup.TERRIER_INDEX_PATH));
    conf.set("indexing.hadoop.prefix", ApplicationSetup.TERRIER_INDEX_PREFIX);
    conf.setMapOutputKeyClass(SplitEmittedTerm.class);
    conf.setMapOutputValueClass(MapEmittedPostingList.class);
    conf.setBoolean("indexing.hadoop.multiple.indices", docPartitioned);

    if (!conf.get("mapred.job.tracker").equals("local")) {
        conf.setMapOutputCompressorClass(GzipCodec.class);
        conf.setCompressMapOutput(true);
    } else {
        conf.setCompressMapOutput(false);
    }

    conf.setInputFormat(MultiFileCollectionInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setOutputKeyComparatorClass(SplitEmittedTerm.SETRawComparatorTermSplitFlush.class);
    conf.setOutputValueGroupingComparator(SplitEmittedTerm.SETRawComparatorTerm.class);
    conf.setReduceSpeculativeExecution(false);
    //parse the collection.spec
    BufferedReader specBR = Files.openFileReader(ApplicationSetup.COLLECTION_SPEC);
    String line = null;
    List<Path> paths = new ArrayList<Path>();
    while ((line = specBR.readLine()) != null) {
        if (line.startsWith("#"))
            continue;
        paths.add(new Path(line));
    }
    specBR.close();
    FileInputFormat.setInputPaths(conf, paths.toArray(new Path[paths.size()]));

    // not sure if this is effective in YARN
    conf.setNumMapTasks(2000);

    // increase the heap usage
    conf.set("mapreduce.map.memory.mb", "6100");
    conf.set("mapred.job.map.memory.mb", "6100");
    conf.set("mapreduce.reduce.memory.mb", "6144");
    conf.set("mapred.job.reduce.memory.mb", "6144");

    conf.set("mapreduce.map.java.opts", "-Xmx6100m");
    conf.set("mapred.map.child.java.opts", "-Xmx6100m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx6144m");
    conf.set("mapred.reduce.child.opts", "-Xmx6144m");

    //conf.setBoolean("mapred.used.genericoptionsparser", true) ;

    // This is the nasty thing in MapReduce v2 and YARN: They always prefer their ancient jars first. Set this on to say you don't like it
    conf.set("mapreduce.job.user.classpath.first", "true");

    // increase the yarn memory to 10 GB
    conf.set("yarn.nodemanager.resource.memory-mb", "12288");
    conf.set("yarn.nodemanager.resource.cpu-vcores", "16");
    conf.set("yarn.scheduler.minimum-allocation-mb", "4096");

    conf.setNumReduceTasks(numberOfReducers);
    if (numberOfReducers > 1) {
        if (docPartitioned)
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitioner.class);
        else
            conf.setPartitionerClass(SplitEmittedTerm.SETPartitionerLowercaseAlphaTerm.class);
    } else {
        //for JUnit tests, we seem to need to restore the original partitioner class
        conf.setPartitionerClass(HashPartitioner.class);
    }

    /*JobID jobId = null;
    boolean ranOK = true;
    try{
       RunningJob rj = JobClient.runJob(conf);
       jobId = rj.getID();
       HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) { 
       logger.error("Problem running job", e);
       e.printStackTrace();
       ranOK = false;
    }
    if (jobId != null)
    {
       deleteTaskFiles(ApplicationSetup.TERRIER_INDEX_PATH, jobId);
    }  */

    //if (ranOK)
    //{
    System.out.println("Merging indices");
    if (!docPartitioned) {
        if (numberOfReducers > 1)
            mergeLexiconInvertedFiles(ApplicationSetup.TERRIER_INDEX_PATH, numberOfReducers);
    }

    Hadoop_BasicSinglePassIndexer.finish(ApplicationSetup.TERRIER_INDEX_PATH,
            docPartitioned ? numberOfReducers : 1, jf);
    //}
    System.out.println("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
    jf.close();
    return 0;
}

From source file:edu.iu.examples.ExamplesMain.java

License:Apache License

/**
 * Builds the job for one collective-communication benchmark run.
 *
 * <p>The mapper class is chosen from {@code cmd} ("allreduce", "allgather",
 * "reduce", "bcast" or "rotate"); any other value leaves the mapper unset.
 * The benchmark arguments are stored in the job configuration under the
 * {@code Constants.ARGS_*} keys, and the job runs map-only.
 *
 * @param cmd name of the collective operation to benchmark
 * @param bytesPerPartition value stored under {@code Constants.ARGS_ELEMENTS}
 * @param numPartitions value stored under {@code Constants.ARGS_PARTITIONS}
 * @param numMappers number of map tasks; also stored under {@code Constants.ARGS_MAPPERS}
 * @param numIterations value stored under {@code Constants.ARGS_ITERATIONS}
 * @param verify value stored under {@code Constants.ARGS_VERIFY}
 * @param inputDirPath job input directory
 * @param outputDirPath job output directory
 * @return the configured, not yet submitted job
 * @throws IOException propagated from job creation or path setup
 */
private Job configureBenchmarkJob(String cmd, int bytesPerPartition, int numPartitions, int numMappers,
        int numIterations, boolean verify, Path inputDirPath, Path outputDirPath) throws IOException {
    final Job benchmarkJob = Job.getInstance(getConf(), "example_job");
    FileInputFormat.setInputPaths(benchmarkJob, inputDirPath);
    FileOutputFormat.setOutputPath(benchmarkJob, outputDirPath);
    benchmarkJob.setInputFormatClass(SingleFileInputFormat.class);
    benchmarkJob.setJarByClass(ExamplesMain.class);

    // Pick the mapper that implements the requested collective operation.
    switch (cmd) {
    case "allreduce":
        benchmarkJob.setMapperClass(AllReduce.class);
        break;
    case "allgather":
        benchmarkJob.setMapperClass(AllGather.class);
        break;
    case "reduce":
        benchmarkJob.setMapperClass(Reduce.class);
        break;
    case "bcast":
        benchmarkJob.setMapperClass(BCast.class);
        break;
    case "rotate":
        benchmarkJob.setMapperClass(Rotate.class);
        break;
    default:
        // Unknown operation: no mapper is set, matching the lenient contract.
        break;
    }

    final JobConf settings = (JobConf) benchmarkJob.getConfiguration();
    settings.set("mapreduce.framework.name", "map-collective");
    settings.setNumMapTasks(numMappers);
    benchmarkJob.setNumReduceTasks(0);

    // Hand the benchmark arguments to the mappers through the configuration.
    settings.set(Constants.ARGS_OPERATION, cmd);
    settings.setInt(Constants.ARGS_ELEMENTS, bytesPerPartition);
    settings.setInt(Constants.ARGS_PARTITIONS, numPartitions);
    settings.setInt(Constants.ARGS_MAPPERS, numMappers);
    settings.setInt(Constants.ARGS_ITERATIONS, numIterations);
    settings.setBoolean(Constants.ARGS_VERIFY, verify);
    return benchmarkJob;
}

From source file:edu.stolaf.cs.wmrserver.HadoopEngine.java

License:Apache License

/**
 * Builds a streaming job from the request and submits it.
 *
 * <p>Steps, in order: choose a fresh output path under {@code _homeDir/out},
 * configure the streaming mapper and reducer, package {@code packageDir} into
 * a job jar, set the I/O formats and paths, apply numeric sorting when
 * requested, record the request metadata in the configuration, submit the
 * job, and finally store the submission state in the submission database.
 *
 * @param request the job request describing name, tasks, sort mode and metadata
 * @param submissionID identifier of the submission row to update
 * @param mapperFile file passed to the streaming mapper setup
 * @param reducerFile file passed to the streaming reducer setup
 * @param packageDir directory packaged into the job jar
 * @param inputPath input path of the job
 * @throws ValidationException declared by the signature
 * @throws NotFoundException declared by the signature
 * @throws CompilationException declared by the signature
 * @throws InternalException declared by the signature
 */
public void submit(JobRequest request, long submissionID, File mapperFile, File reducerFile, File packageDir,
        Path inputPath) throws ValidationException, NotFoundException, CompilationException, InternalException {
    // Pick an output location that does not exist yet.
    final Path outputRoot = new Path(_homeDir, "out");
    Path outputPath;
    try {
        FileSystem outputFs = outputRoot.getFileSystem(new Configuration());
        outputPath = JobServiceHandler.getNonexistantPath(outputRoot, request.getName(), outputFs);
    } catch (IOException e) {
        throw JobServiceHandler.wrapException("Could not construct output path.", e);
    }

    JobConf conf = new JobConf();
    conf.setJobName(request.getName());

    // Mapper, plus the map task count when the request carries one.
    StreamJob.setStreamMapper(conf, mapperFile.toString());
    if (request.isSetMapTasks()) {
        conf.setNumMapTasks(request.getMapTasks());
    }

    // Reducer, plus the reduce task count when the request carries one.
    StreamJob.setStreamReducer(conf, reducerFile.toString());
    if (request.isSetReduceTasks()) {
        conf.setNumReduceTasks(request.getReduceTasks());
    }

    // Package the job directory into a jar and attach it to the job.
    ArrayList<String> jarContents = new ArrayList<String>();
    jarContents.add(packageDir.toString());
    String jobJar;
    try {
        jobJar = StreamJob.createJobJar(conf, jarContents, _tempDir);
    } catch (IOException e) {
        throw JobServiceHandler.wrapException("Could not create job jar.", e);
    }
    if (jobJar != null) {
        conf.setJar(jobJar);
    }

    // TODO: This is a hack. Rewrite streaming to use DistributedCache.
    //conf.setPattern("mapreduce.job.jar.unpack.pattern",
    //              Pattern.compile(".*"));

    // Text key/value pairs in, text key/value pairs out.
    conf.setInputFormat(KeyValueTextInputFormat.class);
    conf.setOutputFormat(TextOutputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // Record the sort mode; numeric sort needs the key-field comparator and partitioner.
    conf.setBoolean(CONF_NUMERIC, request.isNumericSort());
    if (request.isNumericSort()) {
        conf.setOutputKeyComparatorClass(KeyFieldBasedComparator.class);
        conf.setPartitionerClass(KeyFieldBasedPartitioner.class);
        conf.setKeyFieldComparatorOptions("-n");
        conf.setKeyFieldPartitionerOptions("-n");
    }

    // Keep the request metadata alongside the job.
    conf.set(CONF_USER, request.getUser());
    conf.set(CONF_LANGUAGE, request.getLanguage());
    conf.set(CONF_MAPPER, request.getMapper());
    conf.set(CONF_REDUCER, request.getReducer());

    // Hand the job over to the cluster.
    RunningJob submittedJob;
    try {
        JobClient jobClient = new JobClient(new JobConf());
        submittedJob = jobClient.submitJob(conf);
    } catch (IOException e) {
        throw JobServiceHandler.wrapException("There was a serious error while attempting to submit the job.",
                e);
    }

    // Persist the submitted state and the Hadoop job id.
    try {
        SubmissionDatabase.setSubmitted(submissionID);
        SubmissionDatabase.setHadoopID(submissionID, submittedJob.getID().toString());
    } catch (SQLException e) {
        throw JobServiceHandler.wrapException("Could not update submission in database.", e);
    }
}

From source file:edu.ucsb.cs.hybrid.HybridDriver.java

License:Apache License

/**
 * Entry point: configures and runs the map-only similarity job.
 *
 * <p>Generic Hadoop options in {@code args} are applied to the job
 * configuration. Input is read from {@code INPUT_DIR}; output goes to
 * {@code OUTPUT_DIR}, which is deleted first. Depending on the configuration
 * the input is either split by a fixed size or kept unsplit, an optional
 * load-balancing stage runs, the job is submitted, and the result ids are
 * optionally converted afterwards.
 *
 * @param args command-line arguments, including generic Hadoop options
 * @throws ParseException declared by the signature
 * @throws IOException propagated from file system or job operations
 */
public static void main(String args[]) throws ParseException, IOException {

    // For local runs:
    // job.set("mapred.job.tracker", "local");
    // job.set("fs.default.name", "file:///");

    JobConf job = new JobConf();
    job.setJarByClass(HybridDriver.class);
    new GenericOptionsParser(job, args);
    setMapperAndRunner(job);

    // Map-only job: map output and job output use the same key/value types.
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(DocDocWritable.class);
    job.setMapOutputValueClass(FloatWritable.class);
    job.setOutputKeyClass(DocDocWritable.class);
    job.setOutputValueClass(FloatWritable.class);

    Path inputPath = new Path(INPUT_DIR);
    CustomSequenceFileInputFormat.addInputPath(job, inputPath);

    // Write sequence files to a clean output directory.
    Path outputPath = new Path(OUTPUT_DIR);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, outputPath);
    FileSystem.get(job).delete(outputPath, true);

    job.setBoolean("fs.hdfs.impl.disable.cache", true); //xun not sure if needed

    boolean splittable = job.getBoolean(Config.SPLITABLE_PROPERTY, Config.SPLITABLE_VALUE);
    if (!splittable) {
        // NOTE(review): the original author's note says the Splitter call assumes
        // no splitting of partitions for load balancing and should be skipped for
        // the "www" experiments - confirm before changing.
        Splitter.configure(job, inputPath);
        job.setInputFormat(NonSplitableSequenceInputFormat.class);
    } else {
        job.setInputFormat(CustomSequenceFileInputFormat.class);
        // Use one size, converted from MB to bytes, for min split, max split and block size.
        long splitBytes = job.getLong(Config.SPLIT_MB_PROPERTY, Config.SPLIT_MB_VALUE) * 1024 * 1024;
        job.setLong("mapred.min.split.size", splitBytes);
        job.setLong("mapred.max.split.size", splitBytes);
        job.setLong("dfs.block.size", splitBytes);
    }

    //SIGIR'14 two-stage balancing //not yet fully incorporated 
    int loadBalance = job.getInt(Config.LOAD_BALANCE_PROPERTY, Config.LOAD_BALANCE_VALUE);
    if (loadBalance != 0) {
        TwoStageLoadbalancing.main(loadBalance, new Path(PartDriver.OUTPUT_DIR), job);
    }

    JobSubmitter.run(job, "SIMILARITY", job.getFloat(Config.THRESHOLD_PROPERTY, Config.THRESHOLD_VALUE));

    if (job.getBoolean(Config.CONVERT_TEXT_PROPERTY, Config.CONVERT_TEXT_VALUE)) {
        IDMappingJob(args);
    }
}

From source file:edu.umd.cloud9.collection.wikipedia.BuildWikipediaDocnoMapping.java

License:Apache License

/**
 * Runs the MapReduce job that builds the Wikipedia docno mapping.
 *
 * <p>Parses the command line (input dump, temporary output directory and
 * output file are required; the language code and the keep-all flag are
 * optional), runs the job with a single reducer, and then writes the mapping
 * data from the job's {@code part-00000} output to the output file.
 *
 * @param args command-line arguments
 * @return 0 on success, -1 when the command line is invalid
 * @throws Exception propagated from the job or the file system
 */
@SuppressWarnings("static-access")
@Override
public int run(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("XML dump file").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("tmp output directory")
            .create(OUTPUT_PATH_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output file")
            .create(OUTPUT_FILE_OPTION));
    options.addOption(OptionBuilder.withArgName("en|sv|de|cs|es|zh|ar|tr").hasArg()
            .withDescription("two-letter language code").create(LANGUAGE_OPTION));
    options.addOption(KEEP_ALL_OPTION, false, "keep all pages");

    CommandLine commandLine;
    try {
        commandLine = new GnuParser().parse(options, args);
    } catch (ParseException e) {
        System.err.println("Error parsing command line: " + e.getMessage());
        return -1;
    }

    // All three path options are mandatory; print usage when any is missing.
    boolean hasRequiredOptions = commandLine.hasOption(INPUT_OPTION)
            && commandLine.hasOption(OUTPUT_PATH_OPTION) && commandLine.hasOption(OUTPUT_FILE_OPTION);
    if (!hasRequiredOptions) {
        new HelpFormatter().printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    // The language is optional, but when given it must be a two-letter code.
    String language = null;
    if (commandLine.hasOption(LANGUAGE_OPTION)) {
        language = commandLine.getOptionValue(LANGUAGE_OPTION);
        if (language.length() != 2) {
            System.err.println("Error: \"" + language + "\" unknown language!");
            return -1;
        }
    }

    final String inputPath = commandLine.getOptionValue(INPUT_OPTION);
    final String outputPath = commandLine.getOptionValue(OUTPUT_PATH_OPTION);
    final String outputFile = commandLine.getOptionValue(OUTPUT_FILE_OPTION);
    final boolean keepAll = commandLine.hasOption(KEEP_ALL_OPTION);

    LOG.info("Tool name: " + this.getClass().getName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - output file: " + outputFile);
    LOG.info(" - keep all pages: " + keepAll);
    LOG.info(" - language: " + language);

    JobConf conf = new JobConf(getConf(), BuildWikipediaDocnoMapping.class);
    conf.setJobName(String.format("BuildWikipediaDocnoMapping[%s: %s, %s: %s, %s: %s]", INPUT_OPTION, inputPath,
            OUTPUT_FILE_OPTION, outputFile, LANGUAGE_OPTION, language));

    conf.setBoolean(KEEP_ALL_OPTION, keepAll);
    if (language != null) {
        conf.set("wiki.language", language);
    }
    // A single reducer, so all output ends up in part-00000.
    conf.setNumReduceTasks(1);

    final Path jobOutput = new Path(outputPath);
    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, jobOutput);
    FileOutputFormat.setCompressOutput(conf, false);

    conf.setInputFormat(WikipediaPageInputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(IntWritable.class);
    conf.setOutputFormat(TextOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(jobOutput, true);

    RunningJob finishedJob = JobClient.runJob(conf);
    Counters counters = finishedJob.getCounters();
    // The number of mapping entries depends on whether all pages were kept.
    long pageCount = counters.getCounter(keepAll ? PageTypes.TOTAL : PageTypes.ARTICLE);

    WikipediaDocnoMapping.writeDocnoMappingData(FileSystem.get(conf), outputPath + "/part-00000",
            (int) pageCount, outputFile);

    return 0;
}

From source file:edu.umn.cs.spatialHadoop.operations.DistributedJoin.java

License:Open Source License

/**
 * Spatially joins two datasets by repartitioning the smaller dataset based
 * on the larger one, then apply one-to-one joining for each partition
 * /*  ww  w .  j av  a  2  s  .c o  m*/
 * @author Ibrahim Sabek
 * @param inputFiles
 *            Input datasets to be spatially joined
 * @param fileToRepartition
 *            Index of which file will be repartitioned
 * @param outputFile
 *            Output file contains the joining results
 * @param params
 *            Job configurations
 * @return
 * @throws IOException
 */
protected static long repartitionJoinStep(final Path[] inputFiles, int fileToRepartition, Path outputFile,
        OperationsParams params) throws IOException {

    boolean overwrite = params.getBoolean("overwrite", false);
    Shape stockShape = params.getShape("shape");

    // Do the repartition step
    long t1 = System.currentTimeMillis();

    JobConf repartitionJoinJob = new JobConf(params, DistributedJoin.class);
    repartitionJoinJob.setJobName("RepartitionJoin");

    FileSystem fs = inputFiles[fileToRepartition].getFileSystem(params);

    Path outputPath = outputFile;
    if (outputPath == null) {
        do {
            outputPath = new Path(inputFiles[0].getName() + ".dj_" + (int) (Math.random() * 1000000));
        } while (fs.exists(outputPath));
    }

    LOG.info("Repartition - Joining " + inputFiles[0] + " X " + inputFiles[1]);

    // Get the cells to use for repartitioning
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, inputFiles[1 - fileToRepartition]);
    OperationsParams.setRepartitionJoinIndexPath(repartitionJoinJob, RepartitionJoinIndexPath,
            inputFiles[1 - fileToRepartition]);
    OperationsParams.setInactiveModeFlag(repartitionJoinJob, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(repartitionJoinJob, JoiningThresholdPerOnce,
            joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(repartitionJoinJob, isFilterOnlyMode, isFilterOnly);
    CellInfo[] cellsInfo = SpatialSite.cellsOf(fs, inputFiles[1 - fileToRepartition]);

    // Repartition the file to match the other file
    boolean isReplicated = gindex.isReplicated();
    boolean isCompact = gindex.isCompact();
    String sindex;
    if (isReplicated && !isCompact)
        sindex = "grid";
    else if (isReplicated && isCompact)
        sindex = "r+tree";
    else if (!isReplicated && isCompact)
        sindex = "rtree";
    else
        throw new RuntimeException("Unknown index at: " + inputFiles[1 - fileToRepartition]);
    params.set("sindex", sindex);

    // Decide which map function to use based on the type of global index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        repartitionJoinJob.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        repartitionJoinJob.setMapperClass(RepartitionMap.class);
    }
    repartitionJoinJob.setMapOutputKeyClass(IntWritable.class);
    repartitionJoinJob.setMapOutputValueClass(stockShape.getClass());
    ShapeInputFormat.setInputPaths(repartitionJoinJob, inputFiles[fileToRepartition]);
    repartitionJoinJob.setInputFormat(ShapeInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(repartitionJoinJob).getClusterStatus();
    repartitionJoinJob.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    SpatialSite.setCells(repartitionJoinJob, cellsInfo);
    repartitionJoinJob.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // set reduce function
    repartitionJoinJob.setReducerClass(RepartitionJoinReduce.class);
    repartitionJoinJob.setNumReduceTasks(
            Math.max(1, Math.min(cellsInfo.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    repartitionJoinJob.setOutputFormat(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(repartitionJoinJob, outputPath);

    RunningJob runningJob = JobClient.runJob(repartitionJoinJob);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    // Output number of running map tasks
    Counter mapTaskCountCounter = counters.findCounter(JobInProgress.Counter.TOTAL_LAUNCHED_MAPS);
    System.out.println("Number of map tasks " + mapTaskCountCounter.getValue());

    // Delete output directory if not explicitly set by user
    if (outputFile == null)
        fs.delete(outputPath, true);
    long t2 = System.currentTimeMillis();
    System.out.println("Repartitioning and Joining time " + (t2 - t1) + " millis");

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.operations.Plot.java

License:Apache License

/**
 * Configures and launches the "Plot" MapReduce job that renders the shapes
 * stored in {@code inFile} into an image written under {@code outFile}.
 * <p>
 * The requested image size is shrunk along one axis so that the image keeps
 * the aspect ratio of the input file's MBR. The handle of the launched job is
 * stored in {@code lastSubmittedJob}.
 *
 * @param <S> type of shapes stored in the file
 * @param inFile path of the input file
 * @param outFile path of the output image
 * @param shape sample shape; its class is used as the map output value class
 * @param width requested image width
 * @param height requested image height
 * @param color stroke colour; stored in the job configuration as an RGB int
 * @param showBorders whether partition borders are drawn; forced to
 *        {@code false} when the input has no global index
 * @param showBlockCount whether block counts are drawn; forced to
 *        {@code false} when the input has no global index
 * @param showRecordCount whether record counts are drawn; forced to
 *        {@code false} when the input has no global index
 * @param background {@code true} to launch the job through
 *        {@code JobClient#submitJob}, {@code false} to launch it through
 *        {@code JobClient.runJob}
 * @throws IOException if reading the input metadata or launching the job fails
 */
public static <S extends Shape> void plotMapReduce(Path inFile, Path outFile, Shape shape, int width,
        int height, Color color, boolean showBorders, boolean showBlockCount, boolean showRecordCount,
        boolean background) throws IOException {
    JobConf plotJob = new JobConf(Plot.class);
    plotJob.setJobName("Plot");

    // Mapper/reducer wiring; task counts are derived from the cluster capacity
    plotJob.setMapperClass(PlotMap.class);
    ClusterStatus cluster = new JobClient(plotJob).getClusterStatus();
    plotJob.setNumMapTasks(cluster.getMaxMapTasks() * 5);
    plotJob.setReducerClass(PlotReduce.class);
    plotJob.setNumReduceTasks(Math.max(1, cluster.getMaxReduceTasks()));
    plotJob.setMapOutputKeyClass(Rectangle.class);
    SpatialSite.setShapeClass(plotJob, shape.getClass());
    plotJob.setMapOutputValueClass(shape.getClass());

    FileSystem inFs = inFile.getFileSystem(plotJob);
    Rectangle fileMbr = FileMBR.fileMBRMapReduce(inFs, inFile, shape, false);
    FileStatus inputStatus = inFs.getFileStatus(inFile);

    GlobalIndex<Partition> globalIndex = SpatialSite.getGlobalIndex(inFs, inFile);
    final boolean indexed = globalIndex != null;
    CellInfo[] cells;
    if (indexed) {
        // Indexed file: reuse the cells of its existing partitioning
        cells = SpatialSite.cellsOf(inFs, inFile);
    } else {
        // Heap file: build a grid over the MBR so the map function can
        // partition the file
        GridInfo grid = new GridInfo(fileMbr.x1, fileMbr.y1, fileMbr.x2, fileMbr.y2);
        grid.calculateCellDimensions(inputStatus.getLen(), inputStatus.getBlockSize());
        cells = grid.getAllCells();
        // Partition information is meaningless for a heap file
        showBorders = false;
        showBlockCount = false;
        showRecordCount = false;
    }

    // Publish the cells through the job configuration for the mapper
    SpatialSite.setCells(plotJob, cells);

    // Keep the MBR's aspect ratio by shrinking exactly one image dimension
    final boolean mbrWiderThanImage = (fileMbr.x2 - fileMbr.x1) / (fileMbr.y2 - fileMbr.y1) > (double) width
            / height;
    if (!mbrWiderThanImage) {
        // Height is fixed; derive the width
        width = (int) ((fileMbr.x2 - fileMbr.x1) * height / (fileMbr.y2 - fileMbr.y1));
    } else {
        // Width is fixed; derive the height
        height = (int) ((fileMbr.y2 - fileMbr.y1) * width / (fileMbr.x2 - fileMbr.x1));
    }
    LOG.info("Creating an image of size " + width + "x" + height);

    // Rendering parameters read back by the tasks
    ImageOutputFormat.setFileMBR(plotJob, fileMbr);
    ImageOutputFormat.setImageWidth(plotJob, width);
    ImageOutputFormat.setImageHeight(plotJob, height);
    plotJob.setBoolean(ShowBorders, showBorders);
    plotJob.setBoolean(ShowBlockCount, showBlockCount);
    plotJob.setBoolean(ShowRecordCount, showRecordCount);
    plotJob.setInt(StrokeColor, color.getRGB());

    // Input side
    plotJob.setInputFormat(ShapeInputFormat.class);
    ShapeInputFormat.addInputPath(plotJob, inFile);

    // Output side: the committer stitches the images together after all
    // reducers finish
    plotJob.setOutputCommitter(PlotOutputCommitter.class);
    plotJob.setOutputFormat(ImageOutputFormat.class);
    TextOutputFormat.setOutputPath(plotJob, outFile);

    lastSubmittedJob = background ? new JobClient(plotJob).submitJob(plotJob) : JobClient.runJob(plotJob);
}

From source file:edu.umn.cs.spatialHadoop.operations.PyramidPlot.java

License:Apache License

/**
 * Plots a file to a set of images in different zoom levels (a pyramid) using
 * a MapReduce program.
 * <p>
 * All plotting options are read from {@code params} (and from the job
 * configuration built on top of it):
 * <ul>
 * <li>{@code color} - colour used to draw shapes (default black)</li>
 * <li>{@code dataset} - when set, the input is treated as HDF and this is
 *     the dataset name</li>
 * <li>{@code shape} - sample shape used when the input is not HDF</li>
 * <li>{@code rect} - optional range to plot; when set, a range filter is
 *     installed on the job</li>
 * <li>{@code partition} - {@code "space"} (default) selects the
 *     space-partitioning mapper/reducer; any other value selects the
 *     data-partitioning ones</li>
 * <li>{@code sample}, {@code tilewidth}, {@code tileheight} - control
 *     adaptive sampling for point data (tile size defaults to 256x256)</li>
 * <li>{@code keep-ratio} - when true (default), the input MBR is expanded
 *     to a square</li>
 * <li>{@code background} - when true, the job is submitted through
 *     {@code JobClient#submitJob}; otherwise through
 *     {@code JobClient.runJob}</li>
 * </ul>
 *
 * @param <S> type of shapes stored in file
 * @param inFile path to the input file(s)
 * @param outFile path to the output
 * @param params user parameters holding the options listed above
 * @return the handle of the launched job, also stored in
 *         {@code lastSubmittedJob}
 * @throws IOException if reading input metadata or launching the job fails
 */
private static <S extends Shape> RunningJob plotMapReduce(Path inFile, Path outFile, OperationsParams params)
        throws IOException {
    Color color = params.getColor("color", Color.BLACK);

    String hdfDataset = (String) params.get("dataset");
    Shape shape = hdfDataset != null ? new NASARectangle() : params.getShape("shape");
    Shape plotRange = params.getShape("rect");

    boolean background = params.is("background");

    JobConf job = new JobConf(params, PyramidPlot.class);
    job.setJobName("PlotPyramid");

    // Choose the partitioning strategy; both strategies key map output by tile
    String partition = job.get("partition", "space").toLowerCase();
    if (partition.equals("space")) {
        job.setMapperClass(SpacePartitionMap.class);
        job.setReducerClass(SpacePartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(shape.getClass());
        job.setInputFormat(ShapeInputFormat.class);
    } else {
        job.setMapperClass(DataPartitionMap.class);
        job.setReducerClass(DataPartitionReduce.class);
        job.setMapOutputKeyClass(TileIndex.class);
        job.setMapOutputValueClass(ImageWritable.class);
        job.setInputFormat(ShapeArrayInputFormat.class);
    }

    job.setInt("color", color.getRGB());
    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(clusterStatus.getMaxMapTasks() * 5);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    if (shape instanceof Point && job.getBoolean("sample", false)) {
        // Enable adaptive sampling: the ratio is the number of pixels in a
        // tile (scaled by the user factor) divided by the number of records
        int imageWidthRoot = job.getInt("tilewidth", 256);
        int imageHeightRoot = job.getInt("tileheight", 256);
        long recordCount = FileMBR.fileMBR(inFile, params).recordCount;
        float sampleRatio = params.getFloat(GeometricPlot.AdaptiveSampleFactor, 1.0f) * imageWidthRoot
                * imageHeightRoot / recordCount;
        job.setFloat(GeometricPlot.AdaptiveSampleRatio, sampleRatio);
    }

    Rectangle fileMBR;
    if (hdfDataset != null) {
        // Input is HDF
        job.set(HDFRecordReader.DatasetName, hdfDataset);
        job.setBoolean(HDFRecordReader.SkipFillValue, true);
        job.setClass("shape", NASARectangle.class, Shape.class);
        // Determine the range of values by opening one of the HDF files
        Aggregate.MinMax minMax = Aggregate.aggregate(new Path[] { inFile }, params);
        job.setInt(MinValue, minMax.minValue);
        job.setInt(MaxValue, minMax.maxValue);
        //fileMBR = new Rectangle(-180, -90, 180, 90);
        fileMBR = plotRange != null ? plotRange.getMBR() : new Rectangle(-180, -140, 180, 169);
        //      job.setClass(HDFRecordReader.ProjectorClass, MercatorProjector.class,
        //          GeoProjector.class);
    } else {
        fileMBR = FileMBR.fileMBR(inFile, params);
    }

    boolean keepAspectRatio = params.is("keep-ratio", true);
    if (keepAspectRatio) {
        // Expand input file to a square for compatibility with the pyramid
        // structure. The shorter side is padded equally on both ends so the
        // data stays centred.
        if (fileMBR.getWidth() > fileMBR.getHeight()) {
            fileMBR.y1 -= (fileMBR.getWidth() - fileMBR.getHeight()) / 2;
            fileMBR.y2 = fileMBR.y1 + fileMBR.getWidth();
        } else {
            // Halve the difference of the two sides, mirroring the branch
            // above; previously only the width was halved, which shifted
            // the MBR too far to the left.
            fileMBR.x1 -= (fileMBR.getHeight() - fileMBR.getWidth()) / 2;
            fileMBR.x2 = fileMBR.x1 + fileMBR.getHeight();
        }
    }

    SpatialSite.setRectangle(job, InputMBR, fileMBR);

    // Set input and output
    ShapeInputFormat.addInputPath(job, inFile);
    if (plotRange != null) {
        job.setClass(SpatialSite.FilterClass, RangeFilter.class, BlockFilter.class);
    }

    job.setOutputFormat(PyramidOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outFile);
    job.setOutputCommitter(PlotPyramidOutputCommitter.class);

    if (background) {
        JobClient jc = new JobClient(job);
        return lastSubmittedJob = jc.submitJob(job);
    } else {
        return lastSubmittedJob = JobClient.runJob(job);
    }

}