Example usage for org.apache.hadoop.mapred JobConf setLong

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf.setLong, drawn from open-source projects.

Prototype

public void setLong(String name, long value) 

Document

Set the value of the name property to a long.
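
A value stored with setLong is read back with the matching getLong(String name, long defaultValue), inherited from Configuration, which falls back to the supplied default when the property is absent or unparseable. A minimal round trip (the property name "my.sample.threshold" is invented for illustration):

JobConf conf = new JobConf();
conf.setLong("my.sample.threshold", 1000L);               // store a long under a property name
long threshold = conf.getLong("my.sample.threshold", 0L); // read it back; 0L if the property is unset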

Usage

From source file: net.peacesoft.nutch.crawl.ReInjector.java

License: Apache License

public void inject(Path crawlDb, Path urlDir) {
    try {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: crawlDb: " + crawlDb);
            LOG.info("Injector: urlDir: " + urlDir);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
                + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        // map text input file to a <url,CrawlDatum> file
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Converting injected urls to crawl db entries.");
        }
        JobConf sortJob = new NutchJob(getConf());
        sortJob.setJobName("inject " + urlDir);
        FileInputFormat.addInputPath(sortJob, urlDir);
        sortJob.setMapperClass(ReInjector.InjectMapper.class);

        FileOutputFormat.setOutputPath(sortJob, tempDir);
        sortJob.setOutputFormat(SequenceFileOutputFormat.class);
        sortJob.setOutputKeyClass(Text.class);
        sortJob.setOutputValueClass(CrawlDatum.class);
        sortJob.setLong("injector.current.time", System.currentTimeMillis());
        JobClient.runJob(sortJob);

        // merge with existing crawl db
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: Merging injected urls into crawl db.");
        }
        JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
        FileInputFormat.addInputPath(mergeJob, tempDir);
        mergeJob.setReducerClass(ReInjector.InjectReducer.class);
        JobClient.runJob(mergeJob);
        CrawlDb.install(mergeJob, crawlDb);

        // clean up
        FileSystem fs = FileSystem.get(getConf());
        fs.delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReInjector run injector error: " + ex.toString(), ex);
    }
}
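
The "injector.current.time" value set above travels with the job configuration; the map side reads it back in configure(JobConf), the old mapred API's setup hook. A sketch of that reading side, following the usual Nutch pattern (the field name is illustrative):

private long curTime;

public void configure(JobConf job) {
    // recover the value stored with setLong above; fall back to "now" if absent
    curTime = job.getLong("injector.current.time", System.currentTimeMillis());
}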

From source file: nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelJob.java

License: Apache License

@Override
protected void setConfigurationParameters(JobConf jobConfiguration) {
    super.setConfigurationParameters(jobConfiguration);
    jobConfiguration.setLong(ForestFireModelUtils.MAX_ID, getParameters().getMaxId() + 1);
    jobConfiguration.setFloat(ForestFireModelUtils.P_RATIO, getParameters().getPRatio());
    jobConfiguration.setFloat(ForestFireModelUtils.R_RATIO, getParameters().getRRatio());
    jobConfiguration.set(ForestFireModelUtils.CURRENT_AMBASSADORS,
            ForestFireModelUtils.verticesIDsMap2String(burnedEdges));

    if (getIteration() == 1) {
        if (getNumMappers() > 0) {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices() / getNumMappers());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, getNumMappers());
        } else {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR, getParameters().getNumNewVertices());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, 1024 * 1024);
        }
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_INIT, true);
    }
}

From source file: nl.tudelft.graphalytics.mapreducev2.evo.UndirectedForestFireModelJob.java

License: Apache License

@Override
protected void setConfigurationParameters(JobConf jobConfiguration) {
    super.setConfigurationParameters(jobConfiguration);
    jobConfiguration.setLong(ForestFireModelUtils.MAX_ID, getParameters().getMaxId() + 1);
    jobConfiguration.setFloat(ForestFireModelUtils.P_RATIO, getParameters().getPRatio());
    jobConfiguration.setFloat(ForestFireModelUtils.R_RATIO, getParameters().getRRatio());
    jobConfiguration.set(ForestFireModelUtils.CURRENT_AMBASSADORS,
            ForestFireModelUtils.verticesIDsMap2String(burnedEdges));

    if (getIteration() == 1) {
        if (getNumMappers() > 0) {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR,
                    getParameters().getNumNewVertices() / getNumMappers());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, getNumMappers());
        } else {
            jobConfiguration.setInt(ForestFireModelUtils.NEW_VERTICES_NR, getParameters().getNumNewVertices());
            jobConfiguration.setInt(ForestFireModelUtils.ID_SHIFT, 1024 * 1024);
        }
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_INIT, true);
    } else if (getIteration() == getParameters().getMaxIterations() + 1) {
        jobConfiguration.setBoolean(ForestFireModelUtils.IS_FINAL, true);
    }
}
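
In both ForestFireModel jobs, MAX_ID goes through setLong rather than setInt, presumably because vertex identifiers can exceed Integer.MAX_VALUE; the read side simply mirrors each setter with its typed getter. A sketch (field names illustrative, not the project's actual code):

// in the mapper's configure(JobConf conf)
long maxId = conf.getLong(ForestFireModelUtils.MAX_ID, -1L);
float pRatio = conf.getFloat(ForestFireModelUtils.P_RATIO, 0f);
boolean isInit = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);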

From source file: nthu.scopelab.tsqr.ssvd.itQJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
        int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {

    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, itQJob.class);
        job.setJobName("itQ-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        if (i == 0)
            job.setMapperClass(QMapper.class);
        else
            job.setMapperClass(IdentityMapper.class);

        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));

        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);

        FileInputFormat.setInputPaths(job, Paths);

        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
        //output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("itQJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}

From source file: nthu.scopelab.tsqr.ssvd.QJob.java

License: Apache License

public static void run(Configuration conf, Path[] inputPaths, String outputPath, String reduceSchedule, int k,
        int p, long seed, int mis) throws ClassNotFoundException, InterruptedException, IOException {

    String stages[] = reduceSchedule.split(",");
    String rinput = "";
    String routput = outputPath + "/iter-r-";

    for (int i = 0; i < stages.length; i++) {
        String thenumber = Integer.toString(i + 1);
        JobConf job = new JobConf(conf, QJob.class);
        job.setJobName("Q-job-" + thenumber);
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputFormat(SequenceFileOutputFormat.class);

        if (i == 0)
            job.setMapperClass(QMapper.class);
        else
            job.setMapperClass(IdentityMapper.class);

        job.setReducerClass(QReducer.class);
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(LMatrixWritable.class);

        FileSystem fs = FileSystem.get(job);
        Path Paths[];
        fileGather fgather = null;
        if (i == 0)
            fgather = new fileGather(inputPaths, "part", fs);
        else
            fgather = new fileGather(new Path(rinput), "part", fs);
        Paths = fgather.getPaths();
        mis = Checker.checkMis(mis, fgather.getInputSize(), fs);
        job.setNumMapTasks(fgather.recNumMapTasks(mis));

        job.setNumReduceTasks(Integer.parseInt(stages[i]));

        job.setInt(QRFirstJob.COLUMN_SIZE, k + p);
        job.setLong(PROP_OMEGA_SEED, seed);
        job.setInt(PROP_K, k);
        job.setInt(PROP_P, p);

        fs.delete(new Path(routput + thenumber), true);

        FileInputFormat.setInputPaths(job, Paths);

        FileOutputFormat.setOutputPath(job, new Path(routput + thenumber));

        //FileOutputFormat.setCompressOutput(job, true);
        //FileOutputFormat.setOutputCompressorClass(job, DefaultCodec.class);
        //SequenceFileOutputFormat.setOutputCompressionType(job,CompressionType.BLOCK);
        //output first level Q
        MultipleOutputs.addNamedOutput(job, QF_MAT, SequenceFileOutputFormat.class, IntWritable.class,
                LMatrixWritable.class);

        RunningJob rj = JobClient.runJob(job);
        System.out.println("QJob Job ID: " + rj.getJobID().toString());
        rinput = routput + thenumber;
    }
}
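
In both itQJob and QJob, the random seed is passed through the configuration rather than generated per task, so every mapper can reseed deterministically from the same base value and the sampled rows are reproducible across reruns. A sketch of the map side, under the assumption that each task offsets the shared seed (the taskIndex offset is illustrative):

// in the mapper's configure(JobConf job)
long omegaSeed = job.getLong(PROP_OMEGA_SEED, 0L);
int k = job.getInt(PROP_K, -1);
int p = job.getInt(PROP_P, -1);
// derive a per-task generator so the random projection is deterministic
java.util.Random rnd = new java.util.Random(omegaSeed + taskIndex);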

From source file: org.apache.nutch.crawl.CrawlDbReader.java

License: Apache License

public void processTopNJob(String crawlDb, long topN, float min, String output, Configuration config)
        throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: starting (topN=" + topN + ", min=" + min + ")");
        LOG.info("CrawlDb db: " + crawlDb);
    }

    Path outFolder = new Path(output);
    Path tempDir = new Path(config.get("mapred.temp.dir", ".") + "/readdb-topN-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("topN prepare " + crawlDb);
    FileInputFormat.addInputPath(job, new Path(crawlDb, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(CrawlDbTopNMapper.class);
    job.setReducerClass(IdentityReducer.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    // XXX hmmm, no setFloat() in the API ... :(
    job.setLong("db.reader.topn.min", Math.round(1000000.0 * min));
    JobClient.runJob(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: collecting topN scores.");
    }
    job = new NutchJob(config);
    job.setJobName("topN collect " + crawlDb);
    job.setLong("db.reader.topn", topN);

    FileInputFormat.addInputPath(job, tempDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(CrawlDbTopNReducer.class);

    FileOutputFormat.setOutputPath(job, outFolder);
    job.setOutputFormat(TextOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputValueClass(Text.class);

    job.setNumReduceTasks(1); // create a single file.

    JobClient.runJob(job);
    FileSystem fs = FileSystem.get(config);
    fs.delete(tempDir, true);
    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb topN: done");
    }

}
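
The Math.round(1000000.0 * min) line is the workaround the "XXX" comment grumbles about: with no setFloat() in that JobConf API, the float threshold is scaled to a long, stored with setLong, and rescaled on the read side. A sketch of the inverse step (variable names illustrative):

// in the mapper's configure(JobConf job): recover and unscale the threshold
long scaledMin = job.getLong("db.reader.topn.min", 0);
float min = scaledMin / 1000000.0f;

Note that newer Hadoop versions do provide Configuration.setFloat()/getFloat(), as the ForestFireModelJob examples above demonstrate, so the scaling dance is only needed against the older API.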

From source file: org.apache.nutch.crawl.Generator.java

License: Apache License

/**
 * Generate fetchlists in one or more segments. Whether to filter URLs or not
 * is read from the crawl.generate.filter property in the configuration files.
 * If the property is not found, the URLs are filtered. Same for the
 * normalisation.
 * 
 * @param dbDir
 *          Crawl database directory
 * @param segments
 *          Segments directory
 * @param numLists
 *          Number of reduce tasks
 * @param topN
 *          Number of top URLs to be selected
 * @param curTime
 *          Current time in milliseconds
 * 
 * @return Path to generated segment or null if no entries were selected
 * 
 * @throws IOException
 *           When an I/O error occurs
 */
public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter,
        boolean norm, boolean force, int maxNumSegments) throws IOException {

    Path tempDir = new Path(
            getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

    Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("Generator: starting at " + sdf.format(start));
    LOG.info("Generator: Selecting best-scoring urls due for fetch.");
    LOG.info("Generator: filtering: " + filter);
    LOG.info("Generator: normalizing: " + norm);
    if (topN != Long.MAX_VALUE) {
        LOG.info("Generator: topN: " + topN);
    }

    if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) {
        LOG.info("Generator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead");
    }

    // map to inverted subset due for fetch, sort by score
    JobConf job = new NutchJob(getConf());
    job.setJobName("generate: select from " + dbDir);

    if (numLists == -1) { // for politeness make
        numLists = job.getNumMapTasks(); // a partition per fetch task
    }
    if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) {
        // override
        LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
        numLists = 1;
    }
    job.setLong(GENERATOR_CUR_TIME, curTime);
    // record real generation time
    long generateTime = System.currentTimeMillis();
    job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
    job.setLong(GENERATOR_TOP_N, topN);
    job.setBoolean(GENERATOR_FILTER, filter);
    job.setBoolean(GENERATOR_NORMALISE, norm);
    job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments);

    FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(Selector.class);
    job.setPartitionerClass(Selector.class);
    job.setReducerClass(Selector.class);

    FileOutputFormat.setOutputPath(job, tempDir);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(FloatWritable.class);
    job.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
    job.setOutputValueClass(SelectorEntry.class);
    job.setOutputFormat(GeneratorOutputFormat.class);

    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        throw e;
    }

    // read the subdirectories generated in the temp
    // output and turn them into segments
    List<Path> generatedSegments = new ArrayList<Path>();

    FileStatus[] status = fs.listStatus(tempDir);
    try {
        for (FileStatus stat : status) {
            Path subfetchlist = stat.getPath();
            if (!subfetchlist.getName().startsWith("fetchlist-"))
                continue;
            // start a new partition job for this segment
            Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists);
            generatedSegments.add(newSeg);
        }
    } catch (Exception e) {
        LOG.warn("Generator: exception while partitioning segments, exiting ...");
        fs.delete(tempDir, true);
        return null;
    }

    if (generatedSegments.size() == 0) {
        LOG.warn("Generator: 0 records selected for fetching, exiting ...");
        LockUtil.removeLockFile(fs, lock);
        fs.delete(tempDir, true);
        return null;
    }

    if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
        // update the db from tempDir
        Path tempDir2 = new Path(
                getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());

        job = new NutchJob(getConf());
        job.setJobName("generate: updatedb " + dbDir);
        job.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        for (Path segmpaths : generatedSegments) {
            Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, subGenDir);
        }
        FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbUpdater.class);
        job.setReducerClass(CrawlDbUpdater.class);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(CrawlDatum.class);
        FileOutputFormat.setOutputPath(job, tempDir2);
        try {
            JobClient.runJob(job);
            CrawlDb.install(job, dbDir);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(tempDir, true);
            fs.delete(tempDir2, true);
            throw e;
        }
        fs.delete(tempDir2, true);
    }

    LockUtil.removeLockFile(fs, lock);
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Generator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));

    Path[] patharray = new Path[generatedSegments.size()];
    return generatedSegments.toArray(patharray);
}
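
On the read side, Selector.configure() recovers these values with the matching getters; the topN budget in particular is divided among the reduce tasks so each partition enforces its share of the limit. A sketch of that pattern (simplified, not a verbatim excerpt from Nutch's Selector):

// inside Selector.configure(JobConf job)
curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());
long limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks();
maxNumSegments = job.getInt(GENERATOR_MAX_NUM_SEGMENTS, 1);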

From source file: org.apache.nutch.crawl.Injector.java

License: Apache License

public void inject(Path crawlDb, Path urlDir) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: starting at " + sdf.format(start));
        LOG.info("Injector: crawlDb: " + crawlDb);
        LOG.info("Injector: urlDir: " + urlDir);
    }

    Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/inject-temp-"
            + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // map text input file to a <url,CrawlDatum> file
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Converting injected urls to crawl db entries.");
    }
    JobConf sortJob = new NutchJob(getConf());
    sortJob.setJobName("inject " + urlDir);
    FileInputFormat.addInputPath(sortJob, urlDir);
    sortJob.setMapperClass(InjectMapper.class);

    FileOutputFormat.setOutputPath(sortJob, tempDir);
    sortJob.setOutputFormat(SequenceFileOutputFormat.class);
    sortJob.setOutputKeyClass(Text.class);
    sortJob.setOutputValueClass(CrawlDatum.class);
    sortJob.setLong("injector.current.time", System.currentTimeMillis());
    JobClient.runJob(sortJob);

    // merge with existing crawl db
    if (LOG.isInfoEnabled()) {
        LOG.info("Injector: Merging injected urls into crawl db.");
    }
    JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb);
    FileInputFormat.addInputPath(mergeJob, tempDir);
    mergeJob.setReducerClass(InjectReducer.class);
    JobClient.runJob(mergeJob);
    CrawlDb.install(mergeJob, crawlDb);

    // clean up
    FileSystem fs = FileSystem.get(getConf());
    fs.delete(tempDir, true);

    long end = System.currentTimeMillis();
    LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.scoring.webgraph.NodeDumper.java

License: Apache License

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType,
        AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.segment.SegmentMerger.java

License: Apache License

public void merge(Path out, Path[] segs, boolean filter, boolean normalize, long slice) throws Exception {
    String segmentName = Generator.generateSegmentName();
    if (LOG.isInfoEnabled()) {
        LOG.info("Merging " + segs.length + " segments to " + out + "/" + segmentName);
    }
    JobConf job = new NutchJob(getConf());
    job.setJobName("mergesegs " + out + "/" + segmentName);
    job.setBoolean("segment.merger.filter", filter);
    job.setBoolean("segment.merger.normalizer", normalize);
    job.setLong("segment.merger.slice", slice);
    job.set("segment.merger.segmentName", segmentName);
    FileSystem fs = FileSystem.get(getConf());
    // prepare the minimal common set of input dirs
    boolean g = true;
    boolean f = true;
    boolean p = true;
    boolean c = true;
    boolean pd = true;
    boolean pt = true;
    for (int i = 0; i < segs.length; i++) {
        if (!fs.exists(segs[i])) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Input dir " + segs[i] + " doesn't exist, skipping.");
            }
            segs[i] = null;
            continue;
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("SegmentMerger:   adding " + segs[i]);
        }
        Path cDir = new Path(segs[i], Content.DIR_NAME);
        Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
        Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
        Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
        Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
        Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
        c = c && fs.exists(cDir);
        g = g && fs.exists(gDir);
        f = f && fs.exists(fDir);
        p = p && fs.exists(pDir);
        pd = pd && fs.exists(pdDir);
        pt = pt && fs.exists(ptDir);
    }
    StringBuffer sb = new StringBuffer();
    if (c)
        sb.append(" " + Content.DIR_NAME);
    if (g)
        sb.append(" " + CrawlDatum.GENERATE_DIR_NAME);
    if (f)
        sb.append(" " + CrawlDatum.FETCH_DIR_NAME);
    if (p)
        sb.append(" " + CrawlDatum.PARSE_DIR_NAME);
    if (pd)
        sb.append(" " + ParseData.DIR_NAME);
    if (pt)
        sb.append(" " + ParseText.DIR_NAME);
    if (LOG.isInfoEnabled()) {
        LOG.info("SegmentMerger: using segment data from:" + sb.toString());
    }
    for (int i = 0; i < segs.length; i++) {
        if (segs[i] == null)
            continue;
        if (g) {
            Path gDir = new Path(segs[i], CrawlDatum.GENERATE_DIR_NAME);
            FileInputFormat.addInputPath(job, gDir);
        }
        if (c) {
            Path cDir = new Path(segs[i], Content.DIR_NAME);
            FileInputFormat.addInputPath(job, cDir);
        }
        if (f) {
            Path fDir = new Path(segs[i], CrawlDatum.FETCH_DIR_NAME);
            FileInputFormat.addInputPath(job, fDir);
        }
        if (p) {
            Path pDir = new Path(segs[i], CrawlDatum.PARSE_DIR_NAME);
            FileInputFormat.addInputPath(job, pDir);
        }
        if (pd) {
            Path pdDir = new Path(segs[i], ParseData.DIR_NAME);
            FileInputFormat.addInputPath(job, pdDir);
        }
        if (pt) {
            Path ptDir = new Path(segs[i], ParseText.DIR_NAME);
            FileInputFormat.addInputPath(job, ptDir);
        }
    }
    job.setInputFormat(ObjectInputFormat.class);
    job.setMapperClass(SegmentMerger.class);
    job.setReducerClass(SegmentMerger.class);
    FileOutputFormat.setOutputPath(job, out);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(MetaWrapper.class);
    job.setOutputFormat(SegmentOutputFormat.class);

    setConf(job);

    JobClient.runJob(job);
}
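
The "segment.merger.slice" value stored above caps how many URLs end up in each merged output slice; a value of 0 disables slicing. A sketch of the read side, following the pattern in SegmentMerger.configure() (hedged, not a verbatim excerpt):

// in configure(JobConf job): slice > 0 means split output into bounded slices
long sliceSize = job.getLong("segment.merger.slice", 0);
if (sliceSize > 0) {
    // spread the requested slice budget across the reduce tasks
    sliceSize = sliceSize / job.getNumReduceTasks();
}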