Example usage for org.apache.hadoop.mapred JobConf setInt

List of usage examples for org.apache.hadoop.mapred JobConf setInt

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.setInt from open-source projects.

Prototype

public void setInt(String name, int value) 

Document

Set the value of the name property to an int.
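
For orientation, here is a minimal sketch of the setter/getter round trip: setInt stores an int under a property name, and the value is later read back with getInt(name, defaultValue). The key "my.app.max.retries" and the values are illustrative assumptions, not taken from the examples below.

import org.apache.hadoop.mapred.JobConf;

public class SetIntExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // Store an int under a property name of our choosing
        // ("my.app.max.retries" is a made-up key for illustration).
        conf.setInt("my.app.max.retries", 5);

        // Read it back; the second argument is the default returned
        // when the property has not been set.
        int maxRetries = conf.getInt("my.app.max.retries", 1);
        System.out.println("max retries = " + maxRetries);
    }
}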

Usage

From source file:org.hxx.hadoop.GeneratorMapHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment: " + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorMapHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("yyyyMMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    numLists = 4;// TODO
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);

    job.setInputFormat(TableTopInputFormat.class);// ?
    job.setMapperClass(GenerateMark.class);// generate?

    job.setPartitionerClass(URLCountPartitioner.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}
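
The reducer count stored above with job.setInt(GENERATL_REDUCENUM, numLists) would typically be read back on the task side via getInt. GenerateMark itself is not reproduced on this page, so the following is only a hypothetical consumer sketch; it assumes GENERATL_REDUCENUM resolves to the literal "generate.reduceNum" seen in the URLCountPartitioner example further down.

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;

// Hypothetical consumer side: reading back a value stored with setInt.
public class ReduceNumAware extends MapReduceBase {
    private int reduceNum;

    @Override
    public void configure(JobConf job) {
        // Falls back to 1 if the driver never called setInt for this key.
        reduceNum = job.getInt("generate.reduceNum", 1);
    }
}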

From source file:org.hxx.hadoop.GeneratorRedHbase.java

License:Apache License

private RunningJob generateJob(String table, Path segment, int numLists, long topN, long curTime,
        boolean filter, boolean norm, boolean force) throws IOException {
    LOG.info("Generator: segment=" + segment);

    JobConf job = new NutchJob(getConf());
    job.setJarByClass(GeneratorRedHbase.class);
    job.setJobName("generate: from " + table + " "
            + (new SimpleDateFormat("MMdd HH:mm:ss")).format(System.currentTimeMillis()));
    // job.setLong(HConstants.HBASE_CLIENT_SCANNER_TIMEOUT_PERIOD, 300000);

    if (numLists == -1) {
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    // job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.set(GENERATL_TABLE, table);
    job.setInt(GENERATL_REDUCENUM, numLists);
    job.setInt("partition.url.seed", new Random().nextInt());

    job.setInputFormat(CodeInputFormat.class);
    job.setNumMapTasks(1);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setReducerClass(GenerateMark.class);
    job.setNumReduceTasks(numLists);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(CrawlDatum.class);
    job.setOutputKeyComparatorClass(HashComparator.class);
    Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
    FileOutputFormat.setOutputPath(job, output);

    RunningJob r = null;
    try {
        r = JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }
    return r;
}

From source file:org.hxx.hadoop.URLCountPartitioner.java

License:Apache License

public static void main(String[] args) {
    URLCountPartitioner urlCountPartitioner = new URLCountPartitioner();
    JobConf job = new JobConf();

    // job.set("generate.cnt",
    // "partTotal:20000,10.200.4.136:2319,10.200.6.207:20001,10.200.4.141:9408,10.200.5.85:18004,10.200.7.13:10266,10.200.7.225:20002");
    job.set("generate.cnt",
            "partTotal:20000,10.200.4.136:953,10.200.6.207:20001,10.200.4.141:8578,10.200.7.22:20001,10.200.7.13:20001,10.200.7.225:10466");
    job.setInt("generate.reduceNum", 4);

    urlCountPartitioner.configure(job);
}

From source file:org.jd.copier.mapred.DistCp.java

License:Apache License

/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments
 * @return true if it is necessary to launch a job.
 */
private static boolean setup(Configuration conf, JobConf jobConf, final Arguments args) throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    //set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean skipCRCCheck = args.flags.contains(Options.SKIPCRC);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.SKIPCRC.propertyname, skipCRCCheck);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path stagingArea;
    try {
        stagingArea = JobSubmissionFiles.getStagingDir(jClient, conf);
    } catch (InterruptedException e) {
        throw new IOException(e);
    }

    Path jobDirectory = new Path(stagingArea + NAME + "_" + randomId);
    FsPermission mapredSysPerms = new FsPermission(JobSubmissionFiles.JOB_DIR_PERMISSION);
    FileSystem.mkdirs(jClient.getFs(), jobDirectory, mapredSysPerms);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

    FileSystem dstfs = args.dst.getFileSystem(conf);

    // get tokens for all the required FileSystems..
    TokenCache.obtainTokensForNamenodes(jobConf.getCredentials(), new Path[] { args.dst }, conf);

    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (null == parent) {
                // If dst is '/' on S3, it might not exist yet, but dst.getParent()
                // will return null. In this case, use '/' as its own parent to prevent
                // NPE errors below.
                parent = args.dst;
            }
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<FileStatus>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        //skip file if the src and the dst files are the same.
                        skipfile = update
                                && sameFile(srcfs, child, dstfs, new Path(args.dst, dst), skipCRCCheck);
                        //skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            if (LOG.isTraceEnabled()) {
                                LOG.trace("adding file " + child.getPath());
                            }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        LOG.info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    // Explicitly create the tmpDir to ensure that it can be cleaned
    // up by fullyDelete() later.
    tmpDir.getFileSystem(conf).mkdirs(tmpDir);

    LOG.info("sourcePathsCount=" + srcCount);
    LOG.info("filesToCopyCount=" + fileCount);
    LOG.info("bytesToCopyCount=" + StringUtils.humanReadableInt(byteCount));
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
    return fileCount > 0;
}

From source file:org.mitre.bio.mapred.Fasta2SequenceFile.java

License:Open Source License

@Override
public int run(String[] args) throws Exception {
    JobConf conf = new JobConf(getConf());
    boolean cleanLogs = false;

    List<String> other_args = new ArrayList<String>();
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-m".equals(args[i])) {
                conf.setNumMapTasks(Integer.parseInt(args[++i]));
            } else if ("-r".equals(args[i])) {
                conf.setNumReduceTasks(Integer.parseInt(args[++i]));
            } else if ("-c".equals(args[i])) {
                cleanLogs = true;
            } else if ("-n".equals(args[i])) {
                conf.setInt(HEADER_FORMAT, Integer.parseInt(args[++i]));
            } else {
                other_args.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage();
        }
    }
    // Make sure there are exactly 2 parameters left.
    if (other_args.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + other_args.size() + " instead of 2.");
        return printUsage();
    }

    return initJob(conf, other_args.get(0), other_args.get(1), cleanLogs);
}

From source file:org.mitre.ccv.mapred.CalculateCosineDistanceMatrix.java

License:Open Source License

public int initJob(JobConf jobConf, String input, String output) throws Exception {
    JobConf conf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);

    final Path inputPath = new Path(input);
    final FileSystem fs = inputPath.getFileSystem(conf);
    final Path qInputPath = fs.makeQualified(inputPath);

    /**
     * Need to get all of the sample names/labels
     */
    JobConf cacheConf = new JobConf(jobConf, CalculateCosineDistanceMatrix.class);
    cacheConf.setJobName("CacheNorm2MapReduce");
    cacheConf.setNumReduceTasks(1); // Want ONE part file

    // Set up IdentityMapper
    SequenceFileInputFormat.setInputPaths(cacheConf, new Path(input));
    cacheConf.setInputFormat(SequenceFileInputFormat.class);
    cacheConf.setMapperClass(Norm2Mapper.class);
    cacheConf.setOutputKeyClass(StringDoublePairWritable.class);
    cacheConf.setOutputValueClass(SparseVectorWritable.class);

    // Set up IdentityReducer
    cacheConf.setReducerClass(IdentityReducer.class);
    cacheConf.setOutputFormat(SequenceFileOutputFormat.class);
    cacheConf.setNumReduceTasks(1);
    Path sfPath = FileUtils.createRemoteTempPath(fs, qInputPath.getParent());
    LOG.info(String.format("Generating feature vector SequenceFile path %s", sfPath.toString()));
    SequenceFileOutputFormat.setOutputPath(cacheConf, sfPath);
    JobClient.runJob(cacheConf);

    Path cachePath = new Path(sfPath.toString() + Path.SEPARATOR + "part-00000");

    // need to know the size (the reducer might be able to send this back via the Reporter, but how do we grab that info?)
    StringDoublePairWritable key = new StringDoublePairWritable();
    int size = 0;
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, cachePath, conf);
    boolean hasNext = reader.next(key);
    while (hasNext) {
        size += 1;
        hasNext = reader.next(key);
    }
    try {
        reader.close();
    } catch (IOException ioe) {
        // closing the SequenceFile.Reader will throw an exception if the file is over some unknown size
        LOG.debug("Probably caused by closing the SequenceFile.Reader. All is well", ioe);
    }

    //LOG.info(String.format("Caching model file %s", qInputPath.toString()));
    URI listURI = new URI(fs.makeQualified(cachePath).toString());
    DistributedCache.addCacheFile(listURI, conf);
    LOG.info(String.format("SequenceFile cache path %s (%s) with %d labels", listURI.toString(),
            cachePath.getName(), size));
    conf.set(CACHE_PATH, cachePath.getName());
    conf.setInt(DISTANCE_MATRIX_SIZE, size);

    /**
     * Main MapReduce Task of generating dot products
     */
    LOG.info("Generating distances");
    JobConf distanceConf = new JobConf(conf, CalculateCosineDistanceMatrix.class);
    distanceConf.setJobName("DistanceMapReduce");
    // Set up distance mapper
    SequenceFileInputFormat.setInputPaths(distanceConf, new Path(input));
    distanceConf.setInputFormat(SequenceFileInputFormat.class);
    distanceConf.setMapperClass(DistanceMap.class);
    distanceConf.setMapOutputKeyClass(Text.class);
    distanceConf.setMapOutputValueClass(SparseVectorWritable.class);

    // Set up reducer to merge lower-triangle results into a single dense distance vector
    distanceConf.setReducerClass(DistanceReducer.class);
    distanceConf.setOutputKeyClass(Text.class);
    distanceConf.setOutputValueClass(DenseVectorWritable.class);
    distanceConf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(distanceConf, new Path(output));
    JobClient.runJob(distanceConf);

    return 0;
}

From source file:org.mitre.ccv.mapred.GenerateFeatureVectors.java

License:Open Source License

/**
 * Start a new job with the given configuration and parameters.
 *
 * @param jobConf
 * @param listInput         file path containing list of k-mers to use
 * @param cardinality       number of k-mers to use (if the list contains fewer, then that will be used instead).
 * @param input             composition vector {@link SequenceFile} such as generated by {@link CalculateCompositionVectors}
 * @param output
 * @param cleanLogs
 * @return zero if no errors
 * @throws java.lang.Exception
 */
public int initJob(JobConf jobConf, String listInput, Integer cardinality, String input, String output,
        boolean cleanLogs) throws Exception {
    JobConf conf = new JobConf(jobConf, GenerateFeatureVectors.class);
    conf.setJobName("GenerateFeatureVectors");

    Path listPath = new Path(listInput); // i.e, listInput = win32_200902260829/kmer_120811a7fa1_tmp
    FileSystem fs = listPath.getFileSystem(conf);
    if (listInput != null) {
        // @todo: should check to see if it is there!

        // It doesn't say it, but we need the qualified path with the host name
        // otherwise URI sticks the host on to it not so nicely
        Path qPath = fs.makeQualified(listPath);
        // listPath = hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp
        LOG.info(String.format("Caching k-mer file %s", qPath.toString()));
        // URI:hdfs://rocks5.local:54310/user/mcolosimo/win32_200902260829/kmer_120811a7fa1_tmp
        URI listURI = new URI(qPath.toString());
        DistributedCache.addCacheFile(listURI, conf);
        conf.set(KMER_LIST, listPath.getName());
        //LOG.info("k-mer URI:" + listURI.toString());
    } else {
        throw new Exception("GenerateFeatureVectors requires a list of k-mers!");
    }

    /** We need this. It is okay if the cardinality is larger than the number of k-mers. */
    if (cardinality == null) {
        LOG.info("Scanning k-mer file to determine cardinality");
        FSDataInputStream ins = fs.open(listPath);

        KmerEntropyPairWritable w = new KmerEntropyPairWritable();
        int c = 0;
        while (ins.available() > 0) {
            w.readFields(ins);
            c++;
        }
        ins.close();
        fs.close();
        LOG.info(String.format("Found %d k-mers in the file", c));
        cardinality = c;
    }
    conf.setInt(VECTOR_CARDINALITY, cardinality);

    // Set up mapper
    SequenceFileInputFormat.setInputPaths(conf, new Path(input));
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapperClass(CompositionVectorMap.class);
    conf.setOutputKeyClass(Text.class); // final output key class - sample name
    conf.setOutputValueClass(SparseVectorWritable.class); // final output value class

    // Set up combiner/reducer
    conf.setReducerClass(Features2VectorReducer.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(conf, new Path(output));

    JobClient.runJob(conf);

    return 0;
}

From source file:org.smartfrog.services.hadoop.benchmark.citerank.CiteRankTool.java

License:Open Source License

/**
 * Create a configuration bound to this class, with various options set up
 *
 * @return the job
 */
protected JobConf createConfiguration() {
    JobConf conf = new JobConf(getConf(), this.getClass());
    conf.setJobName(getName());
    conf.setJar(getJarName());
    conf.setNumMapTasks(NUM_MAP_TASKS);

    conf.setInt("dfs.replication", NUM_REPLICAS);
    return conf;
}

From source file:org.smartfrog.services.hadoop.mapreduce.terasort.TeraSortJob.java

License:Apache License

@SuppressWarnings("ProhibitedExceptionDeclared")
@Override
public int run(String[] args) throws Exception {
    LOG.info("starting");
    JobConf job = (JobConf) getConf();
    Path inputDir = new Path(args[0]);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    Path partitionFile = new Path(inputDir, TeraConstants.PARTITION_FILENAME);
    URI partitionUri = new URI(partitionFile.toString() + "#" + TeraConstants.PARTITION_FILENAME);
    TeraInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSortJob.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormat(TeraInputFormat.class);
    job.setOutputFormat(TeraOutputFormat.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setBoolean(ClusterConstants.MAPRED_DISABLE_TOOL_WARNING, true);

    TeraInputFormat.writePartitionFile(job, partitionFile);
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);
    job.setInt("dfs.replication", 1);
    job.setInt("mapred.submit.replication", 1);
    TeraOutputFormat.setFinalSync(job, true);
    RunningJob runningJob = JobClient.runJob(job);
    LOG.info("done");
    return 0;
}

From source file:org.terrier.structures.indexing.CompressingMetaIndexBuilder.java

License:Mozilla Public License

/**
 * reverseAsMapReduceJob
 * @param index
 * @param structureName
 * @param keys
 * @param jf
 * @throws Exception
 */
//@SuppressWarnings("deprecation")
public static void reverseAsMapReduceJob(IndexOnDisk index, String structureName, String[] keys,
        HadoopPlugin.JobFactory jf) throws Exception {
    long time = System.currentTimeMillis();
    final JobConf conf = jf.newJob();
    conf.setJobName("Reverse MetaIndex");
    conf.setMapOutputKeyClass(KeyValueTuple.class);
    conf.setMapOutputValueClass(IntWritable.class);
    conf.setMapperClass(MapperReducer.class);
    conf.setReducerClass(MapperReducer.class);
    conf.setNumReduceTasks(keys.length);
    conf.setPartitionerClass(KeyedPartitioner.class);
    conf.setInputFormat(CompressingMetaIndexInputFormat.class);
    conf.setReduceSpeculativeExecution(false);
    conf.set("MetaIndexInputStreamRecordReader.structureName", structureName);
    conf.setInt("CompressingMetaIndexBuilder.reverse.keyCount", keys.length);
    conf.set("CompressingMetaIndexBuilder.reverse.keys", ArrayUtils.join(keys, ","));
    conf.set("CompressingMetaIndexBuilder.forward.valueLengths",
            index.getIndexProperty("index." + structureName + ".value-lengths", ""));
    conf.set("CompressingMetaIndexBuilder.forward.keys",
            index.getIndexProperty("index." + structureName + ".key-names", ""));
    FileOutputFormat.setOutputPath(conf, new Path(index.getPath()));
    HadoopUtility.toHConfiguration(index, conf);

    conf.setOutputFormat(NullOutputFormat.class);
    try {
        RunningJob rj = JobClient.runJob(conf);
        rj.getID();
        HadoopUtility.finishTerrierJob(conf);
    } catch (Exception e) {
        throw new Exception("Problem running job to reverse metadata", e);
    }
    // only update the index from the controlling process, so that we don't have locking/concurrency issues
    index.setIndexProperty("index." + structureName + ".reverse-key-names", ArrayUtils.join(keys, ","));
    index.flush();
    logger.info("Time Taken = " + ((System.currentTimeMillis() - time) / 1000) + " seconds");
}