Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

This page lists example usages of org.apache.hadoop.mapred JobConf setInt, collected from real projects.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.

Usage
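
Before the project examples below, here is a minimal standalone sketch of the usual pattern, assuming a hypothetical property key "example.max.records": the driver stores an int with setInt, and task-side code reads it back with the matching getInt(name, defaultValue).

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf(SetIntExample.class);

        // Store an int under a property key; "example.max.records" is a
        // placeholder name for this sketch, not a standard Hadoop property.
        conf.setInt("example.max.records", 1000);

        // Task-side code (e.g. Mapper.configure(JobConf)) reads it back with
        // getInt; the second argument is the default returned when the key is unset.
        int maxRecords = conf.getInt("example.max.records", 100);
        System.out.println("example.max.records = " + maxRecords);
    }
}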

From source file: org.terrier.utility.io.HadoopUtility.java

License: Mozilla Public License

protected static Path makeTemporaryFile(JobConf jobConf, String filename) throws IOException {
    final int randomKey = jobConf.getInt("terrier.tempfile.id", random.nextInt());
    jobConf.setInt("terrier.tempfile.id", randomKey);
    FileSystem defFS = FileSystem.get(jobConf);
    final Path tempFile = new Path(HADOOP_TMP_PATH + "/" + (randomKey) + "-" + filename);
    defFS.deleteOnExit(tempFile);
    return tempFile;
}

From source file: org.warcbase.index.IndexerRunner.java

License: Apache License

@SuppressWarnings("static-access")
public int run(String[] args) throws IOException, ParseException {
    LOG.info("Initializing indexer...");

    Options options = new Options();

    options.addOption(
            OptionBuilder.withArgName("file").hasArg().withDescription("input file list").create(INPUT_OPTION));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("HDFS index output path")
            .create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of shards")
            .create(SHARDS_OPTION));
    options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("config file (optional)")
            .create(CONFIG_OPTION));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)
            || !cmdline.hasOption(SHARDS_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String configPath = null;
    if (cmdline.hasOption(CONFIG_OPTION)) {
        configPath = cmdline.getOptionValue(CONFIG_OPTION);
    }

    String inputPath = cmdline.getOptionValue(INPUT_OPTION);
    String outputPath = cmdline.getOptionValue(INDEX_OPTION);
    int shards = Integer.parseInt(cmdline.getOptionValue(SHARDS_OPTION));

    JobConf conf = new JobConf(getConf(), IndexerRunner.class);

    if (configPath == null) {
        LOG.info("Config not specified, using default src/main/solr/WARCIndexer.conf");
        configPath = "src/main/solr/WARCIndexer.conf";
    }
    File configFile = new File(configPath);
    if (!configFile.exists()) {
        LOG.error("Error: config does not exist!");
        System.exit(-1);
    }
    Config config = ConfigFactory.parseFile(configFile);
    conf.set(CONFIG_PROPERTIES, config.withOnlyPath("warc").root().render(ConfigRenderOptions.concise()));

    FileSystem fs = FileSystem.get(conf);

    LOG.info("HDFS index output path: " + outputPath);
    conf.set(IndexerReducer.HDFS_OUTPUT_PATH, outputPath);
    if (fs.exists(new Path(outputPath))) {
        LOG.error("Error: path exists already!");
        System.exit(-1);
    }

    LOG.info("Number of shards: " + shards);
    conf.setInt(IndexerMapper.NUM_SHARDS, shards);

    // Add input paths:
    LOG.info("Reading input files...");
    String line = null;
    BufferedReader br = new BufferedReader(new FileReader(inputPath));
    while ((line = br.readLine()) != null) {
        FileInputFormat.addInputPath(conf, new Path(line));
    }
    br.close();
    LOG.info("Read " + FileInputFormat.getInputPaths(conf).length + " input files.");

    conf.setJobName(IndexerRunner.class.getSimpleName() + ": " + inputPath);
    conf.setInputFormat(ArchiveFileInputFormat.class);
    conf.setMapperClass(IndexerMapper.class);
    conf.setReducerClass(IndexerReducer.class);
    conf.setOutputFormat(NullOutputFormat.class);

    // Ensure the JARs we provide take precedence over ones from Hadoop:
    conf.setBoolean("mapreduce.job.user.classpath.first", true);
    // Also set reduce speculative execution off, avoiding duplicate submissions to Solr.
    conf.setBoolean("mapreduce.reduce.speculative", false);

    // Note that we need this to ensure FileSystem.get is thread-safe:
    // @see https://issues.apache.org/jira/browse/HDFS-925
    // @see https://mail-archives.apache.org/mod_mbox/hadoop-user/201208.mbox/%3CCA+4kjVt-QE2L83p85uELjWXiog25bYTKOZXdc1Ahun+oBSJYpQ@mail.gmail.com%3E
    conf.setBoolean("fs.hdfs.impl.disable.cache", true);

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WritableSolrRecord.class);
    conf.setNumReduceTasks(shards); // number of reducers = number of shards

    cacheSolrHome(conf, solrHomeZipName);

    JobClient.runJob(conf);

    return 0;
}

From source file: org.weikey.terasort.TeraSort.java

License: Apache License

@SuppressWarnings("deprecation")
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    SortConfig sortConfig = new SortConfig(job);
    // if (args.length >= 3) {
    // job.setNumReduceTasks(Integer.valueOf(args[2]));
    // if (args.length >= 4) {
    // sortConfig.setStartKey(Integer.valueOf(args[3]));
    // if (args.length >= 5) {
    // sortConfig.setFieldSeparator(args[4]);
    // }
    // }
    // }

    Integer numMapTasks = null;
    Integer numReduceTasks = null;

    List<String> otherArgs = new ArrayList<String>();
    boolean createLzopIndex = false;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                job.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                job.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-f".equals(args[i]) || "--ignore-case".equals(args[i])) {
                sortConfig.setIgnoreCase(true);
            } else if ("-u".equals(args[i]) || "--unique".equals(args[i])) {
                sortConfig.setUnique(true);
            } else if ("-k".equals(args[i]) || "--key".equals(args[i])) {
                String[] parts = StringUtils.split(args[++i], ",");
                sortConfig.setStartKey(Integer.valueOf(parts[0]));
                if (parts.length > 1) {
                    sortConfig.setEndKey(Integer.valueOf(parts[1]));
                }
            } else if ("-t".equals(args[i]) || "--field-separator".equals(args[i])) {
                sortConfig.setFieldSeparator(args[++i]);
            } else if ("--total-order".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
            } else if ("--lzop-index".equals(args[i])) {
                createLzopIndex = true;
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }

    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file: pathmerge.linear.MergePathH1Driver.java

License: Apache License

public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
        int mergeRound, String defaultConfPath) throws IOException {

    JobConf conf = new JobConf(MergePathH1Driver.class);
    conf.setInt("sizeKmer", sizeKmer);

    if (defaultConfPath != null) {
        conf.addResource(new Path(defaultConfPath));
    }
    conf.setJobName("Initial Path-Starting-Points Table");
    conf.setMapperClass(SNodeInitialMapper.class);
    conf.setReducerClass(SNodeInitialReducer.class);

    conf.setMapOutputKeyClass(Kmer.class);
    conf.setMapOutputValueClass(MergePathValueWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    String singlePointPath = "comSinglePath0";

    MultipleOutputs.addNamedOutput(conf, singlePointPath, MergePathMultiSeqOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);

    conf.setOutputKeyClass(VKmerBytesWritable.class);
    conf.setOutputValueClass(MergePathValueWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(inputPath + "stepNext"));
    conf.setNumReduceTasks(numReducers);
    FileSystem dfs = FileSystem.get(conf);
    dfs.delete(new Path(inputPath + "stepNext"), true);
    JobClient.runJob(conf);
    dfs.rename(new Path(inputPath + "stepNext" + "/" + singlePointPath),
            new Path(mergeResultPath + "/" + singlePointPath));
    int iMerge = 0;
    /*----------------------------------------------------------------------*/
    for (iMerge = 1; iMerge <= mergeRound; iMerge++) {
        //            if (!dfs.exists(new Path(inputPath + "-step1")))
        //                break;
        conf = new JobConf(MergePathH1Driver.class);
        conf.setInt("sizeKmer", sizeKmer);
        conf.setInt("iMerge", iMerge);

        if (defaultConfPath != null) {
            conf.addResource(new Path(defaultConfPath));
        }
        conf.setJobName("Path Merge");

        conf.setMapperClass(MergePathH1Mapper.class);
        conf.setReducerClass(MergePathH1Reducer.class);

        conf.setMapOutputKeyClass(VKmerBytesWritable.class);
        conf.setMapOutputValueClass(MergePathValueWritable.class);

        conf.setInputFormat(SequenceFileInputFormat.class);

        String uncompSinglePath = "uncompSinglePath" + iMerge;
        String comSinglePath = "comSinglePath" + iMerge;
        String comCircle = "comCircle" + iMerge;

        MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        conf.setOutputKeyClass(VKmerBytesWritable.class);
        conf.setOutputValueClass(MergePathValueWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        conf.setNumReduceTasks(numReducers);
        dfs.delete(new Path(outputPath), true);
        JobClient.runJob(conf);
        dfs.delete(new Path(inputPath + "stepNext"), true);
        dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
        dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
        dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
    }
}

From source file: pathmerge.log.MergePathH2Driver.java

License: Apache License

public void run(String inputPath, String outputPath, String mergeResultPath, int numReducers, int sizeKmer,
        int mergeRound, String defaultConfPath) throws IOException {

    JobConf conf = new JobConf(MergePathH2Driver.class);
    conf.setInt("sizeKmer", sizeKmer);

    if (defaultConfPath != null) {
        conf.addResource(new Path(defaultConfPath));
    }
    conf.setJobName("Initial Path-Starting-Points Table");
    conf.setMapperClass(SNodeInitialMapper.class);
    conf.setReducerClass(SNodeInitialReducer.class);

    conf.setMapOutputKeyClass(Kmer.class);
    conf.setMapOutputValueClass(MergePathValueWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    String singlePointPath = "comSinglePath0";

    MultipleOutputs.addNamedOutput(conf, singlePointPath, MergePathMultiSeqOutputFormat.class,
            VKmerBytesWritable.class, MergePathValueWritable.class);

    conf.setOutputKeyClass(VKmerBytesWritable.class);
    conf.setOutputValueClass(MergePathValueWritable.class);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(inputPath + "stepNext"));
    conf.setNumReduceTasks(numReducers);
    FileSystem dfs = FileSystem.get(conf);
    dfs.delete(new Path(inputPath + "stepNext"), true);
    JobClient.runJob(conf);
    dfs.rename(new Path(inputPath + "stepNext" + "/" + singlePointPath),
            new Path(mergeResultPath + "/" + singlePointPath));

    int iMerge = 0;
    for (iMerge = 1; iMerge <= mergeRound; iMerge++) {
        //            if (!dfs.exists(new Path(inputPath + "-step1")))
        //                break;
        conf = new JobConf(MergePathH2Driver.class);
        conf.setInt("sizeKmer", sizeKmer);
        conf.setInt("iMerge", iMerge);

        if (defaultConfPath != null) {
            conf.addResource(new Path(defaultConfPath));
        }
        conf.setJobName("Path Merge");

        conf.setMapperClass(MergePathH2Mapper.class);
        conf.setReducerClass(MergePathH2Reducer.class);

        conf.setMapOutputKeyClass(VKmerBytesWritable.class);
        conf.setMapOutputValueClass(MergePathValueWritable.class);

        conf.setInputFormat(SequenceFileInputFormat.class);

        String uncompSinglePath = "uncompSinglePath" + iMerge;
        String comSinglePath = "comSinglePath" + iMerge;
        String comCircle = "comCircle" + iMerge;

        MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiSeqOutputFormat.class,
                VKmerBytesWritable.class, MergePathValueWritable.class);

        conf.setOutputKeyClass(VKmerBytesWritable.class);
        conf.setOutputValueClass(MergePathValueWritable.class);

        FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
        FileOutputFormat.setOutputPath(conf, new Path(outputPath));
        conf.setNumReduceTasks(numReducers);
        dfs.delete(new Path(outputPath), true);
        JobClient.runJob(conf);
        dfs.delete(new Path(inputPath + "stepNext"), true);
        dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
        dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
        dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));
    }
    /*        conf = new JobConf(MergePathH2Driver.class);
            conf.setInt("sizeKmer", sizeKmer);
            conf.setInt("iMerge", iMerge);
            
            if (defaultConfPath != null) {
    conf.addResource(new Path(defaultConfPath));
            }
            conf.setJobName("Path Merge");
            
            conf.setMapperClass(MergePathH2Mapper.class);
            conf.setReducerClass(MergePathH2Reducer.class);
            
            conf.setMapOutputKeyClass(VKmerBytesWritable.class);
            conf.setMapOutputValueClass(MergePathValueWritable.class);
            
            conf.setInputFormat(SequenceFileInputFormat.class);
            
            String uncompSinglePath = "uncompSinglePath" + iMerge;
            String comSinglePath = "comSinglePath" + iMerge;
            String comCircle = "comCircle" + iMerge;
            
            MultipleOutputs.addNamedOutput(conf, uncompSinglePath, MergePathMultiTextOutputFormat.class,
        VKmerBytesWritable.class, MergePathValueWritable.class);
            
            MultipleOutputs.addNamedOutput(conf, comSinglePath, MergePathMultiTextOutputFormat.class,
        VKmerBytesWritable.class, MergePathValueWritable.class);
            
            MultipleOutputs.addNamedOutput(conf, comCircle, MergePathMultiTextOutputFormat.class,
        VKmerBytesWritable.class, MergePathValueWritable.class);
            
            conf.setOutputKeyClass(VKmerBytesWritable.class);
            conf.setOutputValueClass(MergePathValueWritable.class);
            
            FileInputFormat.setInputPaths(conf, new Path(inputPath + "stepNext"));
            FileOutputFormat.setOutputPath(conf, new Path(outputPath));
            conf.setNumReduceTasks(numReducers);
            dfs.delete(new Path(outputPath), true);
            JobClient.runJob(conf);
            dfs.delete(new Path(inputPath + "stepNext"), true);
            dfs.rename(new Path(outputPath + "/" + uncompSinglePath), new Path(inputPath + "stepNext"));
            dfs.rename(new Path(outputPath + "/" + comSinglePath), new Path(mergeResultPath + "/" + comSinglePath));
            dfs.rename(new Path(outputPath + "/" + comCircle), new Path(mergeResultPath + "/" + comCircle));*/
}

From source file: redpoll.examples.sogou.SogouTermDriver.java

License: Apache License

public static void runJob(String input, String output, String analyzerName, int dfLimit) throws IOException {

    JobClient client = new JobClient();
    JobConf conf = new JobConf(SogouTermDriver.class);

    FileSystem fs = FileSystem.get(conf);
    Path outPath = new Path(output);
    if (fs.exists(outPath)) {
        fs.delete(outPath, true);
    }

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TermWritable.class);
    FileInputFormat.setInputPaths(conf, new Path(input));
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.set("redpoll.text.analyzer", analyzerName);
    conf.setInt("redpoll.text.df.limit", dfLimit);

    conf.setMapperClass(TermMapper.class);
    conf.setReducerClass(TermReducer.class);
    conf.setInputFormat(SogouInputFormat.class);
    conf.setOutputFormat(TermOutputFormat.class);
    client.setConf(conf);

    try {
        JobClient.runJob(conf);
    } catch (IOException e) {
        LOG.error(e.toString());
    }
}

From source file: redpoll.text.TfIdfDriver.java

License: Apache License

/**
 * Run the job
 * 
 * @param input the input pathname String
 * @param output the output pathname String
 */
public static void runJob(String input, String output) throws IOException {
    JobClient client = new JobClient();
    JobConf conf = new JobConf(TfIdfDriver.class);

    FileSystem fs = FileSystem.get(conf);
    Path inPath = new Path(input + "/tf");
    FileInputFormat.setInputPaths(conf, inPath);
    Path outPath = new Path(output);
    FileOutputFormat.setOutputPath(conf, outPath);

    conf.setMapperClass(TfIdfMapper.class);
    conf.setReducerClass(TfIdfReducer.class);
    //conf.setNumMapTasks(10);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(TfIdfWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(TfIdfOutputFormat.class);

    conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization,"
            + "org.apache.hadoop.io.serializer.WritableSerialization");
    // serialize a term hashmap. Its key is the term , value is a term index of
    // the term vector.    
    Path dfpath = new Path(input + "/df/part-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, dfpath, conf);
    Text key = new Text();
    IntWritable value = new IntWritable();
    HashMap<String, Integer> termMap = new HashMap<String, Integer>();
    int index = 0;
    while ((reader.next(key, value))) {
        String termString = key.toString();
        if (!termString.equals("redpoll.docs.num")) {
            termMap.put(key.toString(), index);
            index++;
        } else {
            conf.setInt("redpoll.docs.num", value.get());
        }
    }
    reader.close();
    DefaultStringifier<HashMap<String, Integer>> mapStringifier = new DefaultStringifier<HashMap<String, Integer>>(
            conf, GenericsUtil.getClass(termMap));
    String termMapString = mapStringifier.toString(termMap);
    conf.setInt("redpoll.text.terms.num", index); // number of terms
    conf.set("redpoll.text.terms", termMapString);

    client.setConf(conf);
    JobClient.runJob(conf);
}

From source file: source.TeraSort.java

License: Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();

    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraInputFormat.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file: uk.bl.wa.hadoop.indexer.mdx.WARCMDXGeneratorIntegrationTest.java

License: Open Source License

@SuppressWarnings("deprecation")
@Test
public void testMDXGenerator() throws Exception {
    // prepare for test
    // createTextInputFile();

    log.info("Checking input file is present...");
    // Check that the input file is present:
    Path[] inputFiles = FileUtil.stat2Paths(
            getFileSystem().listStatus(new Path(input, "gov.uk-revisit-warcs/"), new OutputLogFilter()));
    Assert.assertEquals(2, inputFiles.length);
    // Create a file of the inputs
    File tmpInputsFile = writeInputFile(inputFiles);

    // Set up arguments for the job:
    String[] args = { "-i", tmpInputsFile.getAbsolutePath(), "-o", this.output.getName() };

    // Set up the WARCIndexerRunner
    WARCMDXGenerator wir = new WARCMDXGenerator();

    // run job
    // Job configuration:
    log.info("Setting up job config...");
    JobConf jobConf = this.mrCluster.createJobConf();
    jobConf.setInt(WARCMDXGenerator.WARC_HADOOP_NUM_REDUCERS, 1);
    jobConf.set("mapred.child.java.opts", "-Xmx512m");
    wir.createJobConf(jobConf, args);
    log.info("Running job...");
    JobClient.runJob(jobConf);
    log.info("Job finished, checking the results...");

    // check the output exists
    Path[] outputFiles = FileUtil.stat2Paths(getFileSystem().listStatus(output, new OutputLogFilter()));
    // Default is 1 reducer (as knitting together multiple sequence files
    // is not a mere matter of concatenation):
    Assert.assertEquals(1, outputFiles.length);

    // Copy the output out of HDFS and onto local FS:
    FileOutputStream fout = new FileOutputStream(outputSeq);
    for (Path output : outputFiles) {
        log.info(" --- output : " + output);
        if (getFileSystem().isFile(output)) {
            InputStream is = getFileSystem().open(output);
            IOUtils.copy(is, fout);
        } else {
            log.info(" --- ...skipping directory...");
        }
        fout.flush();
    }
    fout.close();

    // Check contents of the output:
    Configuration config = new Configuration();
    Path path = new Path(outputSeq.getAbsolutePath());
    SequenceFile.Reader reader = new SequenceFile.Reader(FileSystem.get(config), path, config);
    WritableComparable key = (WritableComparable) reader.getKeyClass().newInstance();
    Writable value = (Writable) reader.getValueClass().newInstance();

    MDX mdx;
    int counter = 0;
    while (reader.next(key, value)) {
        mdx = new MDX(value.toString());
        System.out.println(
                "Key is: " + key + " record_type: " + mdx.getRecordType() + " SURT: " + mdx.getUrlAsSURT());
        counter++;
    }
    assertEquals(114, counter);
    reader.close();

    // Now test the MDXSeqMerger
    testSeqMerger(outputFiles);
}

From source file: voldemort.store.readonly.benchmark.GenerateData.java

License: Apache License

public int run(String[] args) throws Exception {
    if (args.length != 3)
        Utils.croak("USAGE: GenerateData input-file output-dir value-size");
    JobConf conf = new JobConf(getConf(), GenerateData.class);
    conf.setJobName("generate-data");

    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(GenerateDataMapper.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setNumReduceTasks(0);

    conf.setInputFormat(TextInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(BytesWritable.class);
    conf.setOutputValueClass(BytesWritable.class);

    Path inputPath = new Path(args[0]);
    FileInputFormat.setInputPaths(conf, inputPath);
    Path outputPath = new Path(args[1]);
    // delete output path if it already exists
    FileSystem fs = outputPath.getFileSystem(conf);
    if (fs.exists(outputPath))
        fs.delete(outputPath, true);
    FileOutputFormat.setOutputPath(conf, outputPath);
    conf.setInt("value.size", Integer.parseInt(args[2]));

    JobClient.runJob(conf);
    return 0;
}