List of usage examples for org.apache.hadoop.mapred JobConf getBoolean
public boolean getBoolean(String name, boolean defaultValue)
Gets the value of the name property as a boolean. If no such property is specified, or if the specified value is not a valid boolean, then defaultValue is returned.
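Before the source-file examples below, a minimal sketch of the typical pattern: the driver sets a boolean flag on the JobConf with setBoolean, and a map task reads it back with getBoolean inside configure(JobConf). The property name my.example.verbose and the class FlagAwareMapper are made-up names for illustration only, not part of any project listed here.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper: reads a boolean flag from the JobConf in configure().
public class FlagAwareMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    private boolean verbose;

    @Override
    public void configure(JobConf job) {
        // Falls back to false when "my.example.verbose" is unset or not a valid boolean.
        verbose = job.getBoolean("my.example.verbose", false);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        if (verbose) {
            reporter.setStatus("processing offset " + key.get());
        }
        output.collect(new Text(Long.toString(key.get())), value);
    }
}

// Driver side: the flag is set before the job is submitted.
// JobConf job = new JobConf(FlagAwareMapper.class);
// job.setBoolean("my.example.verbose", true);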
From source file:net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java
License:Apache License
public void configure(JobConf job) {
    try {
        solr = SolrUtils.getCommonsHttpSolrServer(job);
        noCommit = job.getBoolean("noCommit", false);
    } catch (MalformedURLException e) {
        throw new RuntimeException(e);
    }
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    try {
        FileSystem fs = FileSystem.get(getConf());
        Path lock = new Path(crawlDb, LOCK_NAME);
        try {
            LockUtil.createLockFile(fs, lock, force);
        } catch (Exception ex) {
            // lock creation failures are silently ignored
        }
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        JobConf job = ReCrawlDb.createJob(getConf(), crawlDb);
        job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
        job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
        job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
        boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb update: starting at " + sdf.format(start));
            LOG.info("CrawlDb update: db: " + crawlDb);
            LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
            LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
            LOG.info("CrawlDb update: URL normalizing: " + normalize);
            LOG.info("CrawlDb update: URL filtering: " + filter);
            LOG.info("CrawlDb update: 404 purging: " + url404Purging);
        }
        // add fetch and parse data from each valid segment as job input
        for (int i = 0; i < segments.length; i++) {
            Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
            Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
            if (fs.exists(fetch) && fs.exists(parse)) {
                FileInputFormat.addInputPath(job, fetch);
                FileInputFormat.addInputPath(job, parse);
            } else {
                LOG.info(" - skipping invalid segment " + segments[i]);
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("ReCrawlDb update: Merging segment data into db.");
        }
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            Path outPath = FileOutputFormat.getOutputPath(job);
            if (fs.exists(outPath)) {
                fs.delete(outPath, true);
            }
            throw e;
        }
        ReCrawlDb.install(job, crawlDb);
        long end = System.currentTimeMillis();
        LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReCrawlDb update error: " + ex.toString(), ex);
    }
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static void install(JobConf job, Path crawlDb) throws IOException {
    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (!preserveBackup && fs.exists(old)) {
        fs.delete(old, true);
    }
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:net.peacesoft.nutch.crawl.ReIndexerMapReduce.java
License:Apache License
public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());
    this.delete = job.getBoolean(INDEXER_DELETE, false);
    this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
    normalize = job.getBoolean(URL_NORMALIZING, false);
    filter = job.getBoolean(URL_FILTERING, false);
    if (normalize) {
        urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
    }
    if (filter) {
        urlFilters = new URLFilters(getConf());
    }
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public void configure(JobConf job) {
    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
    ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
        urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
        urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force)
        throws IOException {
    JobConf job = ReLinkDb.createJob(getConf(), linkDb, normalize, filter);
    Path lock = new Path(linkDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    try {
        LockUtil.createLockFile(fs, lock, force);
    } catch (Exception ex) {
        // lock creation failures are silently ignored
    }
    Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("ReLinkDb: starting at " + sdf.format(start));
        LOG.info("ReLinkDb: linkdb: " + linkDb);
        LOG.info("ReLinkDb: URL normalize: " + normalize);
        LOG.info("ReLinkDb: URL filter: " + filter);
        if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
            LOG.info("ReLinkDb: internal links will be ignored.");
        }
    }
    for (int i = 0; i < segments.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("ReLinkDb: adding segment: " + segments[i]);
        }
        try {
            Path segmentPath = new Path(segments[i], ParseData.DIR_NAME);
            FileStatus[] files = fs.listStatus(segments[i], HadoopFSUtil.getPassDirectoriesFilter(fs));
            if (files.length != 6) {
                // incomplete segment: drop it instead of adding it as input
                fs.delete(segments[i], true);
            } else {
                FileInputFormat.addInputPath(job, segmentPath);
            }
        } catch (FileNotFoundException fex) {
            try {
                fs.delete(segments[i], true);
            } catch (Exception ex) {
            }
        }
    }
    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        throw e;
    }
    if (fs.exists(currentLinkDb)) {
        if (LOG.isInfoEnabled()) {
            LOG.info("ReLinkDb: merging with existing linkdb: " + linkDb);
        }
        // try to merge
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
        FileInputFormat.addInputPath(job, currentLinkDb);
        FileInputFormat.addInputPath(job, newLinkDb);
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(newLinkDb, true);
            throw e;
        }
        fs.delete(newLinkDb, true);
    }
    LinkDb.install(job, linkDb);
    long end = System.currentTimeMillis();
    LOG.info("ReLinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:net.peacesoft.nutch.crawl.ReSolrWriter.java
License:Apache License
void init(SolrServer server, JobConf job) throws IOException {
    solr = server;
    commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
    solrMapping = SolrMappingReader.getInstance(job);
    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
    // parse optional params
    params = new ModifiableSolrParams();
    String paramString = job.get(SolrConstants.PARAMS);
    if (paramString != null) {
        String[] values = paramString.split("&");
        for (String v : values) {
            String[] kv = v.split("=");
            if (kv.length < 2) {
                continue;
            }
            params.add(kv[0], kv[1]);
        }
    }
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelMap.java
License:Apache License
@Override
public void configure(JobConf conf) {
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    this.taskID = attempt.getTaskID().getId(); // todo verify
    this.newVerticesPerSlot = conf.getInt(ForestFireModelUtils.NEW_VERTICES_NR, -1);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.isFirst = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.isInit = this.isFirst;
    if (this.isInit)
        this.ambassadors = new HashMap<LongWritable, List<LongWritable>>();
    else
        this.ambassadors = ForestFireModelUtils
                .verticesIdsString2Map(conf.get(ForestFireModelUtils.CURRENT_AMBASSADORS));
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelReducer.java
License:Apache License
@Override
public void configure(JobConf conf) {
    this.isInit = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.pRatio = conf.getFloat(ForestFireModelUtils.P_RATIO, 0);
    this.rRatio = conf.getFloat(ForestFireModelUtils.R_RATIO, 0);
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.UndirectedForestFireModelMap.java
License:Apache License
@Override
public void configure(JobConf conf) {
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    this.taskID = attempt.getTaskID().getId();
    this.newVerticesPerSlot = conf.getInt(ForestFireModelUtils.NEW_VERTICES_NR, -1);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.isFirst = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.isInit = this.isFirst;
    if (this.isInit)
        this.ambassadors = new HashMap<LongWritable, List<LongWritable>>();
    else
        this.ambassadors = ForestFireModelUtils
                .verticesIdsString2Map(conf.get(ForestFireModelUtils.CURRENT_AMBASSADORS));
}