Example usage for org.apache.hadoop.mapred JobConf getBoolean

Introduction

On this page you can find example usage of org.apache.hadoop.mapred JobConf.getBoolean.

Prototype

public boolean getBoolean(String name, boolean defaultValue) 

Document

Get the value of the name property as a boolean. If no such property is specified, or if the specified value is not a valid boolean, then defaultValue is returned.
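
As a quick, self-contained sketch of the prototype (the property key "example.verbose.logging" below is purely illustrative and not a standard Hadoop property), getBoolean returns the configured value when the property is set and falls back to the supplied default otherwise:

import org.apache.hadoop.mapred.JobConf;

public class GetBooleanExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();

        // "example.verbose.logging" is a made-up key; it has not been set,
        // so the supplied default (false) is returned.
        boolean verbose = conf.getBoolean("example.verbose.logging", false);
        System.out.println("before set: " + verbose);

        // Once the property is set, the configured value wins over the default.
        conf.setBoolean("example.verbose.logging", true);
        verbose = conf.getBoolean("example.verbose.logging", false);
        System.out.println("after set: " + verbose);
    }
}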

Usage

From source file: org.apache.ignite.internal.processors.hadoop.v2.HadoopV2JobResourceManager.java

License: Apache License

/**
 * Set working directory in local file system.
 *
 * @param dir Working directory.
 * @throws IOException If fails.
 */
private void setLocalFSWorkingDirectory(File dir) throws IOException {
    JobConf cfg = ctx.getJobConf();

    Thread.currentThread().setContextClassLoader(cfg.getClassLoader());

    try {
        cfg.set(HadoopFileSystemsUtils.LOC_FS_WORK_DIR_PROP, dir.getAbsolutePath());

        if (!cfg.getBoolean("fs.file.impl.disable.cache", false))
            FileSystem.getLocal(cfg).setWorkingDirectory(new Path(dir.getAbsolutePath()));
    } finally {
        Thread.currentThread().setContextClassLoader(null);
    }
}

From source file: org.apache.mahout.avro.text.mapred.WikipediaAvroDocumentMapper.java

License: Apache License

@Override
public void configure(JobConf job) {
    try {
        if (inputCategories == null) {
            Set<String> newCategories = new HashSet<String>();

            DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(job,
                    GenericsUtil.getClass(newCategories));

            String categoriesStr = setStringifier.toString(newCategories);
            categoriesStr = job.get("wikipedia.categories", categoriesStr);
            inputCategories = setStringifier.fromString(categoriesStr);

        }
        exactMatchOnly = job.getBoolean("exact.match.only", false);
        all = job.getBoolean("all.files", true);
    } catch (IOException ex) {
        throw new IllegalStateException(ex);
    }
    log.info("Configure: Input Categories size: " + inputCategories.size() + " All: " + all + " Exact Match: "
            + exactMatchOnly);
}

From source file: org.apache.nutch.crawl.CrawlDb.java

License: Apache License

public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    FileSystem fs = FileSystem.get(getConf());
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.createLockFile(fs, lock, force);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();

    JobConf job = CrawlDb.createJob(getConf(), crawlDb);
    job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
    job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
    job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);

    boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: starting at " + sdf.format(start));
        LOG.info("CrawlDb update: db: " + crawlDb);
        LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
        LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
        LOG.info("CrawlDb update: URL normalizing: " + normalize);
        LOG.info("CrawlDb update: URL filtering: " + filter);
        LOG.info("CrawlDb update: 404 purging: " + url404Purging);
    }

    for (int i = 0; i < segments.length; i++) {
        Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
        Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
        if (fs.exists(fetch) && fs.exists(parse)) {
            FileInputFormat.addInputPath(job, fetch);
            FileInputFormat.addInputPath(job, parse);
        } else {
            LOG.info(" - skipping invalid segment " + segments[i]);
        }
    }

    if (LOG.isInfoEnabled()) {
        LOG.info("CrawlDb update: Merging segment data into db.");
    }
    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        Path outPath = FileOutputFormat.getOutputPath(job);
        if (fs.exists(outPath))
            fs.delete(outPath, true);
        throw e;
    }

    CrawlDb.install(job, crawlDb);
    long end = System.currentTimeMillis();
    LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
            + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.crawl.CrawlDbFilter.java

License: Apache License

public void configure(JobConf job) {
    urlFiltering = job.getBoolean(URL_FILTERING, false);
    urlNormalizers = job.getBoolean(URL_NORMALIZING, false);
    url404Purging = job.getBoolean(CrawlDb.CRAWLDB_PURGE_404, false);

    if (urlFiltering) {
        filters = new URLFilters(job);
    }
    if (urlNormalizers) {
        scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_CRAWLDB);
        normalizers = new URLNormalizers(job, scope);
    }
}

From source file: org.apache.nutch.crawl.CrawlDbReducer.java

License: Apache License

public void configure(JobConf job) {
    retryMax = job.getInt("db.fetch.retry.max", 3);
    scfilters = new ScoringFilters(job);
    additionsAllowed = job.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    int oldMaxInterval = job.getInt("db.max.fetch.interval", 0);
    maxInterval = job.getInt("db.fetch.interval.max", 0);
    if (oldMaxInterval > 0 && maxInterval == 0)
        maxInterval = oldMaxInterval * FetchSchedule.SECONDS_PER_DAY;
    schedule = FetchScheduleFactory.getFetchSchedule(job);
    int maxLinks = job.getInt("db.update.max.inlinks", 10000);
    linked = new InlinkPriorityQueue(maxLinks);
}

From source file: org.apache.nutch.crawl.LinkDb.java

License: Apache License

public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force)
        throws IOException {
    JobConf job = LinkDb.createJob(getConf(), linkDb, normalize, filter);
    Path lock = new Path(linkDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    LockUtil.createLockFile(fs, lock, force);
    Path currentLinkDb = new Path(linkDb, CURRENT_NAME);

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("LinkDb: starting at " + sdf.format(start));
        LOG.info("LinkDb: linkdb: " + linkDb);
        LOG.info("LinkDb: URL normalize: " + normalize);
        LOG.info("LinkDb: URL filter: " + filter);
        if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
            LOG.info("LinkDb: internal links will be ignored.");
        }
    }

    for (int i = 0; i < segments.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("LinkDb: adding segment: " + segments[i]);
        }
        FileInputFormat.addInputPath(job, new Path(segments[i], ParseData.DIR_NAME));
    }
    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        throw e;
    }
    if (fs.exists(currentLinkDb)) {
        if (LOG.isInfoEnabled()) {
            LOG.info("LinkDb: merging with existing linkdb: " + linkDb);
        }
        // try to merge
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
        FileInputFormat.addInputPath(job, currentLinkDb);
        FileInputFormat.addInputPath(job, newLinkDb);
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(newLinkDb, true);
            throw e;
        }
        fs.delete(newLinkDb, true);
    }
    LinkDb.install(job, linkDb);

    long end = System.currentTimeMillis();
    LOG.info("LinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file: org.apache.nutch.crawl.LinkDbFilter.java

License: Apache License

public void configure(JobConf job) {
    filter = job.getBoolean(URL_FILTERING, false);
    normalize = job.getBoolean(URL_NORMALIZING, false);
    if (filter) {
        filters = new URLFilters(job);
    }
    if (normalize) {
        scope = job.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
        normalizers = new URLNormalizers(job, scope);
    }
}

From source file: org.apache.nutch.parse.ParseOutputFormat.java

License: Apache License

public RecordWriter<Text, Parse> getRecordWriter(FileSystem fs, JobConf job, String name, Progressable progress)
        throws IOException {

    if (job.getBoolean("parse.filter.urls", true)) {
        filters = new URLFilters(job);
    }

    if (job.getBoolean("parse.normalize.urls", true)) {
        normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    }

    this.scfilters = new ScoringFilters(job);
    final int interval = job.getInt("db.fetch.interval.default", 2592000);
    final boolean ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
    int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    final boolean isParsing = job.getBoolean("fetcher.parse", true);
    final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
    final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(job);
    Path out = FileOutputFormat.getOutputPath(job);

    Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
    Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
    Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);

    final String[] parseMDtoCrawlDB = job.get("db.parsemeta.to.crawldb", "").split(" *, *");

    final MapFile.Writer textOut = new MapFile.Writer(job, fs, text.toString(), Text.class, ParseText.class,
            CompressionType.RECORD, progress);

    final MapFile.Writer dataOut = new MapFile.Writer(job, fs, data.toString(), Text.class, ParseData.class,
            compType, progress);

    final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fs, job, crawl, Text.class, CrawlDatum.class,
            compType, progress);

    return new RecordWriter<Text, Parse>() {

        public void write(Text key, Parse parse) throws IOException {

            String fromUrl = key.toString();
            String fromHost = null;
            textOut.append(key, new ParseText(parse.getText()));

            ParseData parseData = parse.getData();
            // recover the signature prepared by Fetcher or ParseSegment
            String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
            if (sig != null) {
                byte[] signature = StringUtil.fromHexString(sig);
                if (signature != null) {
                    // append a CrawlDatum with a signature
                    CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
                    d.setSignature(signature);
                    crawlOut.append(key, d);
                }
            }

            // see if the parse metadata contain things that we'd like
            // to pass to the metadata of the crawlDB entry
            CrawlDatum parseMDCrawlDatum = null;
            for (String mdname : parseMDtoCrawlDB) {
                String mdvalue = parse.getData().getParseMeta().get(mdname);
                if (mdvalue != null) {
                    if (parseMDCrawlDatum == null)
                        parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
                    parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
                }
            }
            if (parseMDCrawlDatum != null)
                crawlOut.append(key, parseMDCrawlDatum);

            try {
                ParseStatus pstatus = parseData.getStatus();
                if (pstatus != null && pstatus.isSuccess()
                        && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                    String newUrl = pstatus.getMessage();
                    int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);

                    try {
                        if (normalizers != null) {
                            newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
                        }
                    } catch (MalformedURLException mfue) {
                        newUrl = null;
                    }

                    if (filters != null) {
                        if (newUrl != null)
                            newUrl = filters.filter(newUrl);
                    }

                    String url = key.toString();
                    if (newUrl != null && !newUrl.equals(url)) {
                        String reprUrl = URLUtil.chooseRepr(url, newUrl,
                                refreshTime < Fetcher.PERM_REFRESH_TIME);
                        CrawlDatum newDatum = new CrawlDatum();
                        newDatum.setStatus(CrawlDatum.STATUS_LINKED);
                        if (reprUrl != null && !reprUrl.equals(newUrl)) {
                            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
                        }
                        crawlOut.append(new Text(newUrl), newDatum);
                    }
                }
            } catch (URLFilterException e) {
                // ignore
            }

            // collect outlinks for subsequent db update
            Outlink[] links = parseData.getOutlinks();
            int outlinksToStore = Math.min(maxOutlinks, links.length);
            if (ignoreExternalLinks) {
                try {
                    fromHost = new URL(fromUrl).getHost().toLowerCase();
                } catch (MalformedURLException e) {
                    fromHost = null;
                }
            } else {
                fromHost = null;
            }

            int validCount = 0;
            CrawlDatum adjust = null;
            List<Entry<Text, CrawlDatum>> targets = new ArrayList<Entry<Text, CrawlDatum>>(outlinksToStore);
            List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
            for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                String toUrl = links[i].getToUrl();

                // Only normalize and filter if fetcher.parse = false
                if (!isParsing) {
                    toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, fromHost, ignoreExternalLinks,
                            filters, normalizers);
                    if (toUrl == null) {
                        continue;
                    }
                }

                CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
                Text targetUrl = new Text(toUrl);
                try {
                    scfilters.initialScore(targetUrl, target);
                } catch (ScoringFilterException e) {
                    LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
                    target.setScore(0.0f);
                }

                targets.add(new SimpleEntry(targetUrl, target));

                // Overwrite URL in Outlink object with normalized URL (NUTCH-1174)
                links[i].setUrl(toUrl);
                outlinkList.add(links[i]);
                validCount++;
            }

            try {
                // compute score contributions and adjustment to the original score
                adjust = scfilters.distributeScoreToOutlinks((Text) key, parseData, targets, null,
                        links.length);
            } catch (ScoringFilterException e) {
                LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
            }
            for (Entry<Text, CrawlDatum> target : targets) {
                crawlOut.append(target.getKey(), target.getValue());
            }
            if (adjust != null)
                crawlOut.append(key, adjust);

            Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
            parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks,
                    parseData.getContentMeta(), parseData.getParseMeta());
            dataOut.append(key, parseData);
            if (!parse.isCanonical()) {
                CrawlDatum datum = new CrawlDatum();
                datum.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
                String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
                try {
                    datum.setFetchTime(Long.parseLong(timeString));
                } catch (Exception e) {
                    LOG.warn("Can't read fetch time for: " + key);
                    datum.setFetchTime(System.currentTimeMillis());
                }
                crawlOut.append(key, datum);
            }
        }

        public void close(Reporter reporter) throws IOException {
            textOut.close();
            dataOut.close();
            crawlOut.close();
        }

    };

}

From source file: org.apache.nutch.parse.ParseSegment.java

License: Apache License

public void configure(JobConf job) {
    setConf(job);
    this.scfilters = new ScoringFilters(job);
    skipTruncated = job.getBoolean(SKIP_TRUNCATED, true);
}

From source file: org.apache.nutch.tools.compat.CrawlDbConverter.java

License: Apache License

public void configure(JobConf job) {
    setConf(job);
    withMetadata = job.getBoolean(CONVERT_META_KEY, false);
    newKey = new Text();
}