Example usage for org.apache.hadoop.mapred JobConf setBoolean

List of usage examples for org.apache.hadoop.mapred JobConf setBoolean

Introduction

This page collects example usages of the org.apache.hadoop.mapred.JobConf setBoolean method.

Prototype

public void setBoolean(String name, boolean value) 

Document

Set the value of the name property to a boolean.
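
Before the full examples below, a minimal, self-contained sketch of setting a boolean property and reading it back; the property name example.feature.enabled is invented purely for illustration.

import org.apache.hadoop.mapred.JobConf;

public class SetBooleanExample {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // store a flag in the job configuration
        job.setBoolean("example.feature.enabled", true);

        // read it back with the matching getter, supplying a default
        // in case the property was never set
        boolean enabled = job.getBoolean("example.feature.enabled", false);
        System.out.println("example.feature.enabled = " + enabled);
    }
}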

Usage

From source file:org.apache.nutch.crawl.LinkDbMerger.java

License:Apache License

public static JobConf createMergeJob(Configuration config, Path linkDb, boolean normalize, boolean filter) {
    Path newLinkDb = new Path("linkdb-merge-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    JobConf job = new NutchJob(config);
    job.setJobName("linkdb merge " + linkDb);

    job.setInputFormat(SequenceFileInputFormat.class);

    job.setMapperClass(LinkDbFilter.class);
    job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize);
    job.setBoolean(LinkDbFilter.URL_FILTERING, filter);
    job.setReducerClass(LinkDbMerger.class);

    FileOutputFormat.setOutputPath(job, newLinkDb);
    job.setOutputFormat(MapFileOutputFormat.class);
    job.setBoolean("mapred.output.compress", true);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Inlinks.class);

    // https://issues.apache.org/jira/browse/NUTCH-1069
    job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    return job;
}
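
The JobConf returned by createMergeJob is configured but not yet submitted. A hypothetical driver sketch (the NutchConfiguration setup and the crawl/linkdb path are assumptions for illustration) could run it with the classic JobClient API; in Nutch itself the merged output is afterwards installed over the existing LinkDb.

    Configuration conf = NutchConfiguration.create();
    JobConf job = LinkDbMerger.createMergeJob(conf, new Path("crawl/linkdb"), true, true);
    JobClient.runJob(job);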

From source file:org.apache.nutch.fetcher.Fetcher2.java

License:Apache License

public void fetch(Path segment, int threads, boolean parsing) throws IOException {

    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting");
        LOG.info("Fetcher: segment: " + segment);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
    job.setBoolean("fetcher.parse", parsing);

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    job.setInputPath(new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(InputFormat.class);

    job.setMapRunnerClass(Fetcher2.class);

    job.setOutputPath(segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(FetcherOutput.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: done");
    }
}

From source file:org.apache.nutch.indexer.solr.SolrClean.java

License:Apache License

public void delete(String crawldb, String solrUrl, boolean noCommit) throws IOException {
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrClean: starting at " + sdf.format(start));

    JobConf job = new NutchJob(getConf());

    FileInputFormat.addInputPath(job, new Path(crawldb, CrawlDb.CURRENT_NAME));
    job.setBoolean("noCommit", noCommit);
    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapperClass(DBFilter.class);
    job.setReducerClass(SolrDeleter.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("SolrClean: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.indexer.solr.SolrIndexer.java

License:Apache License

public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit,
        boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("SolrIndexer: starting at " + sdf.format(start));

    final JobConf job = new NutchJob(getConf());
    job.setJobName("index-solr " + solrUrl);

    LOG.info("SolrIndexer: deleting gone documents: " + deleteGone);
    LOG.info("SolrIndexer: URL filtering: " + filter);
    LOG.info("SolrIndexer: URL normalizing: " + normalize);

    IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job);

    job.set(SolrConstants.SERVER_URL, solrUrl);
    job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone);
    job.setBoolean(IndexerMapReduce.URL_FILTERING, filter);
    job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize);
    if (solrParams != null) {
        job.set(SolrConstants.PARAMS, solrParams);
    }
    NutchIndexWriterFactory.addClassToConf(job, SolrWriter.class);

    job.setReduceSpeculativeExecution(false);

    final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt());

    FileOutputFormat.setOutputPath(job, tmp);
    try {
        JobClient.runJob(job);
        // do the commit once, for all the reducers, in one go
        SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job);

        if (!noCommit) {
            solr.commit();
        }
        long end = System.currentTimeMillis();
        LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception e) {
        LOG.error(e.toString());
    } finally {
        FileSystem.get(job).delete(tmp, true);
    }
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the counter job. The counter job determines the number of links in the
 * webgraph. This is used during analysis.
 *
 * @param fs The job file system.
 * @param webGraphDb The web graph database to use.
 * 
 * @return The number of nodes in the web graph.
 * @throws IOException If an error occurs while running the counter job.
 */
private int runCounter(FileSystem fs, Path webGraphDb) throws IOException {

    // configure the counter job
    Path numLinksPath = new Path(webGraphDb, NUM_NODES);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    JobConf counter = new NutchJob(getConf());
    counter.setJobName("LinkRank Counter");
    FileInputFormat.addInputPath(counter, nodeDb);
    FileOutputFormat.setOutputPath(counter, numLinksPath);
    counter.setInputFormat(SequenceFileInputFormat.class);
    counter.setMapperClass(Counter.class);
    counter.setCombinerClass(Counter.class);
    counter.setReducerClass(Counter.class);
    counter.setMapOutputKeyClass(Text.class);
    counter.setMapOutputValueClass(LongWritable.class);
    counter.setOutputKeyClass(Text.class);
    counter.setOutputValueClass(LongWritable.class);
    counter.setNumReduceTasks(1);
    counter.setOutputFormat(TextOutputFormat.class);
    counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the counter job, outputs to a single reduce task and file
    LOG.info("Starting link counter job");
    try {
        JobClient.runJob(counter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished link counter job");

    // read the first (and only) line from the file which should be the
    // number of links in the web graph
    LOG.info("Reading numlinks temp file");
    FSDataInputStream readLinks = fs.open(new Path(numLinksPath, "part-00000"));
    BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks));
    String numLinksLine = buffer.readLine();
    readLinks.close();

    // check if there are links to process; if none, the webgraph might be empty
    if (numLinksLine == null || numLinksLine.length() == 0) {
        fs.delete(numLinksPath, true);
        throw new IOException("No links to process, is the webgraph empty?");
    }

    // delete the temp file, then convert and return the number of links as an int
    LOG.info("Deleting numlinks temp file");
    fs.delete(numLinksPath, true);
    String numLinks = numLinksLine.split("\\s+")[1];
    return Integer.parseInt(numLinks);
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the initializer job. The initializer job sets up the nodes with a
 * default starting score for link analysis.
 *
 * @param nodeDb The node database to use.
 * @param output The job output directory.
 * 
 * @throws IOException If an error occurs while running the initializer job.
 */
private void runInitializer(Path nodeDb, Path output) throws IOException {

    // configure the initializer
    JobConf initializer = new NutchJob(getConf());
    initializer.setJobName("LinkAnalysis Initializer");
    FileInputFormat.addInputPath(initializer, nodeDb);
    FileOutputFormat.setOutputPath(initializer, output);
    initializer.setInputFormat(SequenceFileInputFormat.class);
    initializer.setMapperClass(Initializer.class);
    initializer.setMapOutputKeyClass(Text.class);
    initializer.setMapOutputValueClass(Node.class);
    initializer.setOutputKeyClass(Text.class);
    initializer.setOutputValueClass(Node.class);
    initializer.setOutputFormat(MapFileOutputFormat.class);
    initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the initializer
    LOG.info("Starting initialization job");
    try {
        JobClient.runJob(initializer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished initialization job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the inverter job. The inverter job flips outlinks to inlinks to be
 * passed into the analysis job.
 * 
 * The inverter job takes a link loops database if it exists. It is an
 * optional component of link analysis due to its extreme computational and
 * space requirements, but it can be very useful in weeding out and eliminating
 * link farms and other spam pages.
 * 
 * @param nodeDb The node database to use.
 * @param outlinkDb The outlink database to use.
 * @param loopDb The loop database to use if it exists.
 * @param output The output directory.
 * 
 * @throws IOException If an error occurs while running the inverter job.
 */
private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) throws IOException {

    // configure the inverter
    JobConf inverter = new NutchJob(getConf());
    inverter.setJobName("LinkAnalysis Inverter");
    FileInputFormat.addInputPath(inverter, nodeDb);
    FileInputFormat.addInputPath(inverter, outlinkDb);

    // add the loop database if it exists (is not null)
    if (loopDb != null) {
        FileInputFormat.addInputPath(inverter, loopDb);
    }
    FileOutputFormat.setOutputPath(inverter, output);
    inverter.setInputFormat(SequenceFileInputFormat.class);
    inverter.setMapperClass(Inverter.class);
    inverter.setReducerClass(Inverter.class);
    inverter.setMapOutputKeyClass(Text.class);
    inverter.setMapOutputValueClass(ObjectWritable.class);
    inverter.setOutputKeyClass(Text.class);
    inverter.setOutputValueClass(LinkDatum.class);
    inverter.setOutputFormat(SequenceFileOutputFormat.class);
    inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    // run the inverter job
    LOG.info("Starting inverter job");
    try {
        JobClient.runJob(inverter);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished inverter job.");
}

From source file:org.apache.nutch.scoring.webgraph.LinkRank.java

License:Apache License

/**
 * Runs the link analysis job. The link analysis job applies the link rank
 * formula to create a score per url and stores that score in the NodeDb.
 *
 * Typically the link analysis job is run a number of times to allow the link
 * rank scores to converge.
 * 
 * @param nodeDb The node database from which we are getting previous link
 * rank scores.
 * @param inverted The inverted inlinks
 * @param output The link analysis output.
 * @param iteration The current iteration number.
 * @param numIterations The total number of link analysis iterations
 * 
 * @throws IOException If an error occurs during link analysis.
 */
private void runAnalysis(Path nodeDb, Path inverted, Path output, int iteration, int numIterations,
        float rankOne) throws IOException {

    JobConf analyzer = new NutchJob(getConf());
    analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1));
    analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) + " of " + numIterations);
    FileInputFormat.addInputPath(analyzer, nodeDb);
    FileInputFormat.addInputPath(analyzer, inverted);
    FileOutputFormat.setOutputPath(analyzer, output);
    analyzer.set("link.analyze.rank.one", String.valueOf(rankOne));
    analyzer.setMapOutputKeyClass(Text.class);
    analyzer.setMapOutputValueClass(ObjectWritable.class);
    analyzer.setInputFormat(SequenceFileInputFormat.class);
    analyzer.setMapperClass(Analyzer.class);
    analyzer.setReducerClass(Analyzer.class);
    analyzer.setOutputKeyClass(Text.class);
    analyzer.setOutputValueClass(Node.class);
    analyzer.setOutputFormat(MapFileOutputFormat.class);
    analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false);

    LOG.info("Starting analysis job");
    try {
        JobClient.runJob(analyzer);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    LOG.info("Finished analysis job.");
}

From source file:org.apache.nutch.scoring.webgraph.Loops.java

License:Apache License

/**
 * Runs the various loop jobs.
 */
public void findLoops(Path webGraphDb) throws IOException {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Loops: starting at " + sdf.format(start));
        LOG.info("Loops: webgraphdb: " + webGraphDb);
    }

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path outlinkDb = new Path(webGraphDb, WebGraph.OUTLINK_DIR);
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Path routes = new Path(webGraphDb, ROUTES_DIR);
    Path tempRoute = new Path(webGraphDb,
            ROUTES_DIR + "-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

    // run the initializer
    JobConf init = new NutchJob(conf);
    init.setJobName("Initializer: " + webGraphDb);
    FileInputFormat.addInputPath(init, outlinkDb);
    FileInputFormat.addInputPath(init, nodeDb);
    init.setInputFormat(SequenceFileInputFormat.class);
    init.setMapperClass(Initializer.class);
    init.setReducerClass(Initializer.class);
    init.setMapOutputKeyClass(Text.class);
    init.setMapOutputValueClass(ObjectWritable.class);
    init.setOutputKeyClass(Text.class);
    init.setOutputValueClass(Route.class);
    FileOutputFormat.setOutputPath(init, tempRoute);
    init.setOutputFormat(SequenceFileOutputFormat.class);

    try {
        LOG.info("Loops: starting initializer");
        JobClient.runJob(init);
        LOG.info("Loops: installing initializer " + routes);
        FSUtils.replace(fs, routes, tempRoute, true);
        LOG.info("Loops: finished initializer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }

    // run the loops job up to a maximum depth (default 2), which will find a
    // 3-link loop cycle
    int depth = conf.getInt("link.loops.depth", 2);
    for (int i = 0; i < depth; i++) {

        JobConf looper = new NutchJob(conf);
        looper.setJobName("Looper: " + (i + 1) + " of " + depth);
        FileInputFormat.addInputPath(looper, outlinkDb);
        FileInputFormat.addInputPath(looper, routes);
        looper.setInputFormat(SequenceFileInputFormat.class);
        looper.setMapperClass(Looper.class);
        looper.setReducerClass(Looper.class);
        looper.setMapOutputKeyClass(Text.class);
        looper.setMapOutputValueClass(ObjectWritable.class);
        looper.setOutputKeyClass(Text.class);
        looper.setOutputValueClass(Route.class);
        FileOutputFormat.setOutputPath(looper, tempRoute);
        looper.setOutputFormat(SequenceFileOutputFormat.class);
        looper.setBoolean("last", i == (depth - 1));

        try {
            LOG.info("Loops: starting looper");
            JobClient.runJob(looper);
            LOG.info("Loops: installing looper " + routes);
            FSUtils.replace(fs, routes, tempRoute, true);
            LOG.info("Loops: finished looper");
        } catch (IOException e) {
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    // run the finalizer
    JobConf finalizer = new NutchJob(conf);
    finalizer.setJobName("Finalizer: " + webGraphDb);
    FileInputFormat.addInputPath(finalizer, routes);
    finalizer.setInputFormat(SequenceFileInputFormat.class);
    finalizer.setMapperClass(Finalizer.class);
    finalizer.setReducerClass(Finalizer.class);
    finalizer.setMapOutputKeyClass(Text.class);
    finalizer.setMapOutputValueClass(Route.class);
    finalizer.setOutputKeyClass(Text.class);
    finalizer.setOutputValueClass(LoopSet.class);
    FileOutputFormat.setOutputPath(finalizer, new Path(webGraphDb, LOOPS_DIR));
    finalizer.setOutputFormat(MapFileOutputFormat.class);

    try {
        LOG.info("Loops: starting finalizer");
        JobClient.runJob(finalizer);
        LOG.info("Loops: finished finalizer");
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:org.apache.nutch.scoring.webgraph.NodeDumper.java

License:Apache License

/**
 * Runs the process to dump the top urls out to a text file.
 *
 * @param webGraphDb The WebGraph from which to pull values.
 *
 * @param topN
 * @param output
 *
 * @throws IOException If an error occurs while dumping the top values.
 */
public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType,
        AggrType aggrType, boolean asSequenceFile) throws Exception {

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    LOG.info("NodeDumper: starting at " + sdf.format(start));
    Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR);
    Configuration conf = getConf();

    JobConf dumper = new NutchJob(conf);
    dumper.setJobName("NodeDumper: " + webGraphDb);
    FileInputFormat.addInputPath(dumper, nodeDb);
    dumper.setInputFormat(SequenceFileInputFormat.class);

    if (nameType == null) {
        dumper.setMapperClass(Sorter.class);
        dumper.setReducerClass(Sorter.class);
        dumper.setMapOutputKeyClass(FloatWritable.class);
        dumper.setMapOutputValueClass(Text.class);
    } else {
        dumper.setMapperClass(Dumper.class);
        dumper.setReducerClass(Dumper.class);
        dumper.setMapOutputKeyClass(Text.class);
        dumper.setMapOutputValueClass(FloatWritable.class);
    }

    dumper.setOutputKeyClass(Text.class);
    dumper.setOutputValueClass(FloatWritable.class);
    FileOutputFormat.setOutputPath(dumper, output);

    if (asSequenceFile) {
        dumper.setOutputFormat(SequenceFileOutputFormat.class);
    } else {
        dumper.setOutputFormat(TextOutputFormat.class);
    }

    dumper.setNumReduceTasks(1);
    dumper.setBoolean("inlinks", type == DumpType.INLINKS);
    dumper.setBoolean("outlinks", type == DumpType.OUTLINKS);
    dumper.setBoolean("scores", type == DumpType.SCORES);

    dumper.setBoolean("host", nameType == NameType.HOST);
    dumper.setBoolean("domain", nameType == NameType.DOMAIN);
    dumper.setBoolean("sum", aggrType == AggrType.SUM);
    dumper.setBoolean("max", aggrType == AggrType.MAX);

    dumper.setLong("topn", topN);

    // Set equals-sign as separator for Solr's ExternalFileField
    if (asEff) {
        dumper.set("mapred.textoutputformat.separator", "=");
    }

    try {
        LOG.info("NodeDumper: running");
        JobClient.runJob(dumper);
    } catch (IOException e) {
        LOG.error(StringUtils.stringifyException(e));
        throw e;
    }
    long end = System.currentTimeMillis();
    LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}