Example usage for org.apache.hadoop.mapred JobConf setLong

List of usage examples for org.apache.hadoop.mapred JobConf setLong

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setLong.

Prototype

public void setLong(String name, long value) 

Source Link

Document

Set the value of the name property to a long.

Usage

From source file:edu.umn.cs.spatialHadoop.operations.Within.java

License:Open Source License

public static <S extends Shape> long within(Path[] inFiles, Path userOutputPath, OperationsParams params)
        throws IOException, InterruptedException {
    JobConf job = new JobConf(params, Within.class);

    LOG.info("Within journey starts ....");
    FileSystem inFs = inFiles[0].getFileSystem(job);
    Path outputPath = userOutputPath;
    if (outputPath == null) {
        FileSystem outFs = FileSystem.get(job);
        do {//  w  w  w . j  av  a2 s  . c o  m
            outputPath = new Path(inFiles[0].getName() + ".sjmr_" + (int) (Math.random() * 1000000));
        } while (outFs.exists(outputPath));
    }
    FileSystem outFs = outputPath.getFileSystem(job);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setJobName("Within");
    job.setMapperClass(WithinMap.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IndexedText.class);
    job.setNumMapTasks(5 * Math.max(1, clusterStatus.getMaxMapTasks()));
    job.setLong("mapred.min.split.size", Math.max(inFs.getFileStatus(inFiles[0]).getBlockSize(),
            inFs.getFileStatus(inFiles[1]).getBlockSize()));

    job.setReducerClass(WithinReduce.class);
    job.setNumReduceTasks(Math.max(1, clusterStatus.getMaxReduceTasks()));

    job.setInputFormat(ShapeLineInputFormat.class);
    if (job.getBoolean("output", true))
        job.setOutputFormat(TextOutputFormat.class);
    else
        job.setOutputFormat(NullOutputFormat.class);
    ShapeLineInputFormat.setInputPaths(job, inFiles);

    // Calculate and set the dimensions of the grid to use in the map phase
    long total_size = 0;
    Rectangle mbr = new Rectangle(Double.MAX_VALUE, Double.MAX_VALUE, -Double.MAX_VALUE, -Double.MAX_VALUE);
    for (Path file : inFiles) {
        FileSystem fs = file.getFileSystem(params);
        Rectangle file_mbr = FileMBR.fileMBR(file, params);
        mbr.expand(file_mbr);
        total_size += FileUtil.getPathSize(fs, file);
    }
    // If the largest file is globally indexed, use its partitions
    total_size += total_size * job.getFloat(SpatialSite.INDEXING_OVERHEAD, 0.2f);
    int sjmrPartitioningGridFactor = params.getInt(PartitioiningFactor, 20);
    int num_cells = (int) Math.max(1,
            total_size * sjmrPartitioningGridFactor / outFs.getDefaultBlockSize(outputPath));
    LOG.info("Number of cells is configured to be " + num_cells);

    OperationsParams.setInactiveModeFlag(job, InactiveMode, isReduceInactive);
    OperationsParams.setJoiningThresholdPerOnce(job, JoiningThresholdPerOnce, joiningThresholdPerOnce);
    OperationsParams.setFilterOnlyModeFlag(job, isFilterOnlyMode, isFilterOnly);

    GridInfo gridInfo = new GridInfo(mbr.x1, mbr.y1, mbr.x2, mbr.y2);
    gridInfo.calculateCellDimensions(num_cells);
    OperationsParams.setShape(job, PartitionGrid, gridInfo);

    TextOutputFormat.setOutputPath(job, outputPath);

    if (OperationsParams.isLocal(job, inFiles)) {
        // Enforce local execution if explicitly set by user or for small files
        job.set("mapred.job.tracker", "local");
    }

    // Start the job
    RunningJob runningJob = JobClient.runJob(job);
    Counters counters = runningJob.getCounters();
    Counter outputRecordCounter = counters.findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
    final long resultCount = outputRecordCounter.getValue();

    return resultCount;
}

From source file:edu.umn.cs.spatialHadoop.temporal.RepartitionTemporal.java

License:Apache License

public static void repartitionMapReduce(Path[] inputPaths, Path outputPath, Shape stockShape, long blockSize,
        CellInfo[] cellInfos, String sindex, boolean overwrite) throws IOException {

    JobConf job = new JobConf(Repartition.class);

    job.setJobName("RepartitionTemporal");
    FileSystem outFs = outputPath.getFileSystem(job);

    // Overwrite output file
    if (outFs.exists(outputPath)) {
        if (overwrite)
            outFs.delete(outputPath, true);
        else//from  w ww .j  a  va2s  .c  o  m
            throw new RuntimeException(
                    "Output file '" + outputPath + "' already exists and overwrite flag is not set");
    }

    // Decide which map function to use depending on the type of global
    // index
    if (sindex.equals("rtree") || sindex.equals("str")) {
        // Repartition without replication
        job.setMapperClass(RepartitionMapNoReplication.class);
    } else {
        // Repartition with replication (grid and r+tree)
        job.setMapperClass(RepartitionMap.class);
    }
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(stockShape.getClass());
    CombinedSpatialInputFormat.setInputPaths(job, inputPaths);
    job.setInputFormat(CombinedSpatialInputFormat.class);

    ClusterStatus clusterStatus = new JobClient(job).getClusterStatus();
    job.setNumMapTasks(10 * Math.max(1, clusterStatus.getMaxMapTasks()));

    FileOutputFormat.setOutputPath(job, outputPath);
    if (sindex.equals("grid") || sindex.equals("str") || sindex.equals("str+")) {
        job.setOutputFormat(GridOutputFormat.class);
    } else if (sindex.equals("rtree") || sindex.equals("r+tree")) {
        // For now, the two types of local index are the same
        job.setOutputFormat(RTreeGridOutputFormat.class);
    } else {
        throw new RuntimeException("Unsupported spatial index: " + sindex);
    }

    SpatialSite.setCells(job, cellInfos);
    job.setBoolean(SpatialSite.OVERWRITE, overwrite);

    // Set reduce function
    job.setReducerClass(RepartitionReduce.class);
    job.setNumReduceTasks(
            Math.max(1, Math.min(cellInfos.length, (clusterStatus.getMaxReduceTasks() * 9 + 5) / 10)));

    // Set output committer that combines output files together
    job.setOutputCommitter(RepartitionOutputCommitter.class);

    if (blockSize != 0) {
        job.setLong("dfs.block.size", blockSize);
        job.setLong("fs.local.block.size", blockSize);
    }

    JobClient.runJob(job);
}

From source file:fr.ens.biologie.genomique.eoulsan.modules.mgmt.hadoop.DistCp.java

License:LGPL

/**
 * Initialize DFSCopyFileMapper specific job-configuration.
 * @param conf : The dfs/mapred configuration.
 * @param jobConf : The handle to the jobConf object to be initialized.
 * @param args Arguments/*from   w w w .ja v a  2 s  . c  om*/
 */
private static void setup(final Configuration conf, final JobConf jobConf, final Arguments args)
        throws IOException {
    jobConf.set(DST_DIR_LABEL, args.dst.toUri().toString());

    // set boolean values
    final boolean update = args.flags.contains(Options.UPDATE);
    final boolean overwrite = !update && args.flags.contains(Options.OVERWRITE);
    jobConf.setBoolean(Options.UPDATE.propertyname, update);
    jobConf.setBoolean(Options.OVERWRITE.propertyname, overwrite);
    jobConf.setBoolean(Options.IGNORE_READ_FAILURES.propertyname,
            args.flags.contains(Options.IGNORE_READ_FAILURES));
    jobConf.setBoolean(Options.PRESERVE_STATUS.propertyname, args.flags.contains(Options.PRESERVE_STATUS));

    final String randomId = getRandomId();
    JobClient jClient = new JobClient(jobConf);
    Path jobDirectory = new Path(jClient.getSystemDir(), NAME + "_" + randomId);
    jobConf.set(JOB_DIR_LABEL, jobDirectory.toString());

    long maxBytesPerMap = conf.getLong(BYTES_PER_MAP_LABEL, BYTES_PER_MAP);

    FileSystem dstfs = args.dst.getFileSystem(conf);
    boolean dstExists = dstfs.exists(args.dst);
    boolean dstIsDir = false;
    if (dstExists) {
        dstIsDir = dstfs.getFileStatus(args.dst).isDir();
    }

    // default logPath
    Path logPath = args.log;
    if (logPath == null) {
        String filename = "_distcp_logs_" + randomId;
        if (!dstExists || !dstIsDir) {
            Path parent = args.dst.getParent();
            if (null == parent) {
                // If dst is '/' on S3, it might not exist yet, but dst.getParent()
                // will return null. In this case, use '/' as its own parent to
                // prevent
                // NPE errors below.
                parent = args.dst;
            }
            if (!dstfs.exists(parent)) {
                dstfs.mkdirs(parent);
            }
            logPath = new Path(parent, filename);
        } else {
            logPath = new Path(args.dst, filename);
        }
    }
    FileOutputFormat.setOutputPath(jobConf, logPath);

    // create src list, dst list
    FileSystem jobfs = jobDirectory.getFileSystem(jobConf);

    Path srcfilelist = new Path(jobDirectory, "_distcp_src_files");
    jobConf.set(SRC_LIST_LABEL, srcfilelist.toString());
    SequenceFile.Writer src_writer = SequenceFile.createWriter(jobfs, jobConf, srcfilelist, LongWritable.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    Path dstfilelist = new Path(jobDirectory, "_distcp_dst_files");
    SequenceFile.Writer dst_writer = SequenceFile.createWriter(jobfs, jobConf, dstfilelist, Text.class,
            Text.class, SequenceFile.CompressionType.NONE);

    Path dstdirlist = new Path(jobDirectory, "_distcp_dst_dirs");
    jobConf.set(DST_DIR_LIST_LABEL, dstdirlist.toString());
    SequenceFile.Writer dir_writer = SequenceFile.createWriter(jobfs, jobConf, dstdirlist, Text.class,
            FilePair.class, SequenceFile.CompressionType.NONE);

    // handle the case where the destination directory doesn't exist
    // and we've only a single src directory OR we're updating/overwriting
    // the contents of the destination directory.
    final boolean special = (args.srcs.size() == 1 && !dstExists) || update || overwrite;
    int srcCount = 0, cnsyncf = 0, dirsyn = 0;
    long fileCount = 0L, byteCount = 0L, cbsyncs = 0L;
    try {
        for (Iterator<Path> srcItr = args.srcs.iterator(); srcItr.hasNext();) {
            final Path src = srcItr.next();
            FileSystem srcfs = src.getFileSystem(conf);
            FileStatus srcfilestat = srcfs.getFileStatus(src);
            Path root = special && srcfilestat.isDir() ? src : src.getParent();
            if (srcfilestat.isDir()) {
                ++srcCount;
            }

            Stack<FileStatus> pathstack = new Stack<>();
            for (pathstack.push(srcfilestat); !pathstack.empty();) {
                FileStatus cur = pathstack.pop();
                FileStatus[] children = srcfs.listStatus(cur.getPath());
                for (int i = 0; i < children.length; i++) {
                    boolean skipfile = false;
                    final FileStatus child = children[i];
                    final String dst = makeRelative(root, child.getPath());
                    ++srcCount;

                    if (child.isDir()) {
                        pathstack.push(child);
                    } else {
                        // skip file if the src and the dst files are the same.
                        skipfile = update && sameFile(srcfs, child, dstfs, new Path(args.dst, dst));
                        // skip file if it exceed file limit or size limit
                        skipfile |= fileCount == args.filelimit || byteCount + child.getLen() > args.sizelimit;

                        if (!skipfile) {
                            ++fileCount;
                            byteCount += child.getLen();

                            // if (LOG.isTraceEnabled()) {
                            // LOG.trace("adding file " + child.getPath());
                            // }

                            ++cnsyncf;
                            cbsyncs += child.getLen();
                            if (cnsyncf > SYNC_FILE_MAX || cbsyncs > maxBytesPerMap) {
                                src_writer.sync();
                                dst_writer.sync();
                                cnsyncf = 0;
                                cbsyncs = 0L;
                            }
                        }
                    }

                    if (!skipfile) {
                        src_writer.append(new LongWritable(child.isDir() ? 0 : child.getLen()),
                                new FilePair(child, dst));
                    }

                    dst_writer.append(new Text(dst), new Text(child.getPath().toString()));
                }

                if (cur.isDir()) {
                    String dst = makeRelative(root, cur.getPath());
                    dir_writer.append(new Text(dst), new FilePair(cur, dst));
                    if (++dirsyn > SYNC_FILE_MAX) {
                        dirsyn = 0;
                        dir_writer.sync();
                    }
                }
            }
        }
    } finally {
        checkAndClose(src_writer);
        checkAndClose(dst_writer);
        checkAndClose(dir_writer);
    }

    FileStatus dststatus = null;
    try {
        dststatus = dstfs.getFileStatus(args.dst);
    } catch (FileNotFoundException fnfe) {
        getLogger().info(args.dst + " does not exist.");
    }

    // create dest path dir if copying > 1 file
    if (dststatus == null) {
        if (srcCount > 1 && !dstfs.mkdirs(args.dst)) {
            throw new IOException("Failed to create" + args.dst);
        }
    }

    final Path sorted = new Path(jobDirectory, "_distcp_sorted");
    checkDuplication(jobfs, dstfilelist, sorted, conf);

    if (dststatus != null && args.flags.contains(Options.DELETE)) {
        deleteNonexisting(dstfs, dststatus, sorted, jobfs, jobDirectory, jobConf, conf);
    }

    Path tmpDir = new Path(
            (dstExists && !dstIsDir) || (!dstExists && srcCount == 1) ? args.dst.getParent() : args.dst,
            "_distcp_tmp_" + randomId);
    jobConf.set(TMP_DIR_LABEL, tmpDir.toUri().toString());

    // Explicitly create the tmpDir to ensure that it can be cleaned
    // up by fullyDelete() later.
    tmpDir.getFileSystem(conf).mkdirs(tmpDir);

    getLogger().info("srcCount=" + srcCount);
    jobConf.setInt(SRC_COUNT_LABEL, srcCount);
    jobConf.setLong(TOTAL_SIZE_LABEL, byteCount);
    setMapCount(byteCount, jobConf);
}

From source file:hibench.HtmlConf.java

License:Apache License

public void setJobConf(JobConf job) {

    job.setInt("agents", agents);
    job.setLong("pages", pages);
    job.setLong("agentpages", agentpages);

    long vLinkElems = (null == linkZipf) ? -1 : linkZipf.velems;
    long vWordElems = (null == wordZipf) ? -1 : wordZipf.velems;

    job.setLong("vLinkElems", vLinkElems);
    job.setLong("vWordElems", vWordElems);
}

From source file:hibench.VisitConf.java

License:Apache License

public void setJobConf(JobConf job) {
    job.setInt("agents", agents);
    job.setLong("visits", visits);
    job.setLong("agentvisits", agentvisits);
    job.setLong("pages", pages);
}

From source file:hibench.ZipfRandom.java

License:Apache License

public void setJobConf(JobConf job) {
    job.setInt("agents", agents);
    job.setLong("elems", elems);
    job.setLong("agentelems", agentelems);
    job.setFloat("exponent", (float) exponent);
    job.setFloat("scale", (float) scale);
}

From source file:io.fluo.stress.trie.Generate.java

License:Apache License

@Override
public int run(String[] args) throws Exception {

    if (args.length != 4) {
        log.error("Usage: " + this.getClass().getSimpleName()
                + " <numMappers> <numbersPerMapper> <max> <output dir>");
        System.exit(-1);/*from   w w  w .j ava 2  s . co m*/
    }

    int numMappers = Integer.parseInt(args[0]);
    int numPerMapper = Integer.parseInt(args[1]);
    long max = Long.parseLong(args[2]);
    Path out = new Path(args[3]);

    Preconditions.checkArgument(numMappers > 0, "numMappers <= 0");
    Preconditions.checkArgument(numPerMapper > 0, "numPerMapper <= 0");
    Preconditions.checkArgument(max > 0, "max <= 0");

    JobConf job = new JobConf(getConf());

    job.setJobName(this.getClass().getName());

    job.setJarByClass(Generate.class);

    job.setInt(TRIE_GEN_NUM_PER_MAPPER_PROP, numPerMapper);
    job.setInt(TRIE_GEN_NUM_MAPPERS_PROP, numMappers);
    job.setLong(TRIE_GEN_MAX_PROP, max);

    job.setInputFormat(RandomLongInputFormat.class);

    job.setNumReduceTasks(0);

    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(NullWritable.class);

    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, out);

    RunningJob runningJob = JobClient.runJob(job);
    runningJob.waitForCompletion();
    return runningJob.isSuccessful() ? 0 : -1;
}

From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");

    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write(new String(s + "\n").getBytes());
    }//from   w ww. j  av a 2s.co  m
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    long totalNumTerms = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS")
            .getCounter();

    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute some # docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();

        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);

    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}

From source file:job.uncombine.compressed.BigBuildInvertedIndex.java

License:Apache License

/**
 * Runs this tool.//from w w w . j  ava  2  s.c  o  m
 */
public int run(String[] args) throws Exception {

    //long GB = 1024 * 1024 * 1024;
    //long totalDataSize = 1 * GB;

    int reduceNumArray[] = { 9, 18 };
    int splitSizeMBArray[] = { 64, 128, 256 };
    int xmxArray[] = { 1000, 2000, 3000, 4000 };
    int xmsArray[] = { 0, 1 };
    int ismbArray[] = { 200, 400, 600, 800 };

    for (int splitIndex = 0; splitIndex < splitSizeMBArray.length; splitIndex++) {
        for (int reduceNumIndex = 0; reduceNumIndex < reduceNumArray.length; reduceNumIndex++) {
            for (int xmxIndex = 0; xmxIndex < xmxArray.length; xmxIndex++) {
                for (int xmsIndex = 0; xmsIndex < xmsArray.length; xmsIndex++) {
                    for (int ismbIndex = 0; ismbIndex < ismbArray.length; ismbIndex++) {

                        int reduceNum = reduceNumArray[reduceNumIndex];
                        int splitMB = splitSizeMBArray[splitIndex];
                        int xmx = xmxArray[xmxIndex];
                        int xms = xmsArray[xmsIndex] * xmx;
                        int ismb = ismbArray[ismbIndex];

                        JobConf conf = new JobConf(getConf(), BigBuildInvertedIndex.class);

                        conf.setLong("mapred.min.split.size", SplitTable.getMapred_min_split_size(splitMB));
                        conf.setLong("mapred.max.split.size", SplitTable.getMapred_max_split_size(splitMB));

                        //conf.setInt("my.sample.split.num", (int) (totalDataSize / (splitMB * 1024 * 1024)));

                        conf.setInt("mapred.reduce.tasks", reduceNum);
                        conf.setInt("io.sort.mb", ismb);

                        if (xms == 0)
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m");
                        else
                            conf.set("mapred.child.java.opts", "-Xmx" + xmx + "m -Xms" + xms + "m");

                        conf.setInt("child.monitor.metrics.seconds", 2);
                        conf.setInt("child.monitor.jvm.seconds", 2);
                        conf.setInt("child.monitor.jstat.seconds", 2);

                        conf.setJobName("BigBuildInvertedIndex " + splitMB + "MB "
                                + conf.get("mapred.child.java.opts") + " ismb=" + ismb + " RN=" + reduceNum);

                        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
                        if (otherArgs.length != 2) {
                            System.err.println("Usage: BigBuildInvertedIndex <in> <out>");
                            System.exit(2);
                        }

                        conf.setMapOutputKeyClass(Text.class);
                        conf.setMapOutputValueClass(PairOfInts.class);
                        conf.setOutputKeyClass(Text.class);
                        conf.setOutputValueClass(PairOfWritables.class);
                        SequenceFileOutputFormat.setOutputCompressionType(conf, CompressionType.BLOCK);
                        conf.setOutputFormat(MapFileOutputFormat.class);

                        conf.setMapperClass(MyMapper.class);
                        // conf.setCombinerClass(IdentityReducer.class);
                        conf.setReducerClass(MyReducer.class);
                        FileInputFormat.setInputPaths(conf, new Path(otherArgs[0]));
                        FileOutputFormat.setOutputPath(conf, new Path(otherArgs[1]));

                        FileSystem.get(conf).delete(new Path(otherArgs[1]), true);

                        try {
                            JobClient.runJob(conf);
                        } catch (IOException e) {
                            e.printStackTrace();
                        }
                        Thread.sleep(15000);

                    }
                }
            }
        }
    }
    return 0;
}

From source file:net.peacesoft.nutch.crawl.ReGenerator.java

License:Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or
 * not is read from the crawl.generate.filter property in the configuration
 * files. If the property is not found, the URLs are filtered. Same for the
 * normalisation./*from   w w w.ja  v  a  2  s . c  o  m*/
 *
 * @param dbDir Crawl database directory
 * @param segments Segments directory
 * @param numLists Number of reduce tasks
 * @param topN Number of top URLs to be selected
 * @param curTime Current time in milliseconds
 *
 * @return Path to generated segment or null if no entries were selected
 *
 * @throws IOException When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {
    try {
        Path tempDir = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, force);

        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        LOG.info("ReGenerator: starting at " + sdf.format(start));
        LOG.info("ReGenerator: Selecting best-scoring urls due for fetch.");
        LOG.info("ReGenerator: filtering: " + filter);
        LOG.info("ReGenerator: normalizing: " + norm);
        if (topN != Long.MAX_VALUE) {
            LOG.info("ReGenerator: topN: " + topN);
        }

        if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
            LOG.info(
                    "ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
        }

        // map to inverted subset due for fetch, sort by score
        JobConf job = new NutchJob(getConf());
        job.setJobName("generate: select from " + dbDir);

        if (numLists == -1) { // for politeness make
            numLists = job.getNumMapTasks(); // a partition per fetch task
        }
        if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
            // override
            LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        job.setLong(GENERATOR_CUR_TIME, curTime);
        // record real generation time
        long generateTime = System.currentTimeMillis();
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        job.setLong(GENERATOR_TOP_N, topN);
        job.setBoolean(GENERATOR_FILTER, filter);
        job.setBoolean(GENERATOR_NORMALISE, norm);
        job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);

        job.setMapperClass(Selector.class);
        job.setPartitionerClass(Selector.class);
        job.setReducerClass(Selector.class);

        FileOutputFormat.setOutputPath(job, tempDir);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(FloatWritable.class);
        job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        job.setOutputValueClass(SelectorEntry.class);
        job.setOutputFormat(GeneratorOutputFormat.class);

        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            throw e;
        }

        // read the subdirectories generated in the temp
        // output and turn them into segments
        List<Path> generatedSegments = new ArrayList<Path>();

        FileStatus[] status = fs.listStatus(tempDir);
        try {
            for (FileStatus stat : status) {
                Path subfetchlist = stat.getPath();
                if (!subfetchlist.getName().startsWith("fetchlist-")) {
                    continue;
                }
                // start a new partition job for this segment
                Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
                generatedSegments.add(newSeg);
            }
        } catch (Exception e) {
            LOG.warn("ReGenerator: exception while partitioning segments, exiting ...");
            fs.delete(tempDir, true);
            return null;
        }

        if (generatedSegments.size() == 0) {
            LOG.warn("ReGenerator: 0 records selected for fetching, exiting ...");
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            return null;
        }

        if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
            // update the db from tempDir
            Path tempDir2 = new Path(
                    getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

            job = new NutchJob(getConf());
            job.setJobName("generate: updatedb " + dbDir);
            job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
            for (Path segmpaths : generatedSegments) {
                Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
                FileInputFormat.addInputPath(job, subGenDir);
            }
            FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
            job.setInputFormat(SequenceFileInputFormat.class);
            job.setMapperClass(CrawlDbUpdater.class);
            job.setReducerClass(CrawlDbUpdater.class);
            job.setOutputFormat(MapFileOutputFormat.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(CrawlDatum.class);
            FileOutputFormat.setOutputPath(job, tempDir2);
            try {
                JobClient.runJob(job);
                CrawlDb.install(job, dbDir);
            } catch (IOException e) {
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir, true);
                fs.delete(tempDir2, true);
                throw e;
            }
            fs.delete(tempDir2, true);
        }

        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));

        Path[] patharray = new Path[generatedSegments.size()];
        return generatedSegments.toArray(patharray);
    } catch (Exception ex) {
        LOG.error("ReGenerator generate error: " + ex.toString(), ex);
        return null;
    }
}