List of usage examples for org.apache.hadoop.mapred JobConf getBoolean
public boolean getBoolean(String name, boolean defaultValue)
Gets the value of the name property as a boolean. If no such property is specified, or if the specified value is not a valid boolean, then defaultValue is returned.
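Before the source-file examples below, a minimal sketch of the typical pattern: the driver sets a boolean flag on the JobConf with setBoolean, and a map task reads it back with getBoolean inside configure(JobConf). The property name my.example.verbose and the class FlagAwareMapper are made-up names for illustration only, not part of any project listed here.

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical mapper: reads a boolean flag from the JobConf in configure().
public class FlagAwareMapper extends MapReduceBase implements Mapper<LongWritable, Text, Text, Text> {

    private boolean verbose;

    @Override
    public void configure(JobConf job) {
        // Falls back to false when "my.example.verbose" is unset or not a valid boolean.
        verbose = job.getBoolean("my.example.verbose", false);
    }

    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
            throws IOException {
        if (verbose) {
            reporter.setStatus("processing offset " + key.get());
        }
        output.collect(new Text(Long.toString(key.get())), value);
    }
}

// Driver side: the flag is set before the job is submitted.
// JobConf job = new JobConf(FlagAwareMapper.class);
// job.setBoolean("my.example.verbose", true);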
From source file:net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java
License:Apache License
public void configure(JobConf job) {
    try {
        solr = SolrUtils.getCommonsHttpSolrServer(job);
        noCommit = job.getBoolean("noCommit", false);
    } catch (MalformedURLException e) {
        throw new RuntimeException(e);
    }
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed,
        boolean force) throws IOException {
    try {
        FileSystem fs = FileSystem.get(getConf());
        Path lock = new Path(crawlDb, LOCK_NAME);
        try {
            LockUtil.createLockFile(fs, lock, force);
        } catch (Exception ex) {
            // lock creation failures are silently ignored
        }
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        JobConf job = ReCrawlDb.createJob(getConf(), crawlDb);
        job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed);
        job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);
        job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize);
        boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false);
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb update: starting at " + sdf.format(start));
            LOG.info("CrawlDb update: db: " + crawlDb);
            LOG.info("CrawlDb update: segments: " + Arrays.asList(segments));
            LOG.info("CrawlDb update: additions allowed: " + additionsAllowed);
            LOG.info("CrawlDb update: URL normalizing: " + normalize);
            LOG.info("CrawlDb update: URL filtering: " + filter);
            LOG.info("CrawlDb update: 404 purging: " + url404Purging);
        }
        // add fetch and parse data from each valid segment as job input
        for (int i = 0; i < segments.length; i++) {
            Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME);
            Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME);
            if (fs.exists(fetch) && fs.exists(parse)) {
                FileInputFormat.addInputPath(job, fetch);
                FileInputFormat.addInputPath(job, parse);
            } else {
                LOG.info(" - skipping invalid segment " + segments[i]);
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("ReCrawlDb update: Merging segment data into db.");
        }
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            Path outPath = FileOutputFormat.getOutputPath(job);
            if (fs.exists(outPath)) {
                fs.delete(outPath, true);
            }
            throw e;
        }
        ReCrawlDb.install(job, crawlDb);
        long end = System.currentTimeMillis();
        LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: "
                + TimingUtil.elapsedTime(start, end));
    } catch (Exception ex) {
        LOG.error("ReCrawlDb update error: " + ex.toString(), ex);
    }
}
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static void install(JobConf job, Path crawlDb) throws IOException {
    boolean preserveBackup = job.getBoolean("db.preserve.backup", true);
    Path newCrawlDb = FileOutputFormat.getOutputPath(job);
    FileSystem fs = new JobClient(job).getFs();
    Path old = new Path(crawlDb, "old");
    Path current = new Path(crawlDb, CURRENT_NAME);
    if (fs.exists(current)) {
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        fs.rename(current, old);
    }
    fs.mkdirs(crawlDb);
    fs.rename(newCrawlDb, current);
    if (!preserveBackup && fs.exists(old)) {
        fs.delete(old, true);
    }
    Path lock = new Path(crawlDb, LOCK_NAME);
    LockUtil.removeLockFile(fs, lock);
}
From source file:net.peacesoft.nutch.crawl.ReIndexerMapReduce.java
License:Apache License
public void configure(JobConf job) {
    setConf(job);
    this.filters = new IndexingFilters(getConf());
    this.scfilters = new ScoringFilters(getConf());
    this.delete = job.getBoolean(INDEXER_DELETE, false);
    this.skip = job.getBoolean(INDEXER_SKIP_NOTMODIFIED, false);
    normalize = job.getBoolean(URL_NORMALIZING, false);
    filter = job.getBoolean(URL_FILTERING, false);
    if (normalize) {
        urlNormalizers = new URLNormalizers(getConf(), URLNormalizers.SCOPE_DEFAULT);
    }
    if (filter) {
        urlFilters = new URLFilters(getConf());
    }
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public void configure(JobConf job) {
    maxAnchorLength = job.getInt("db.max.anchor.length", 100);
    ignoreInternalLinks = job.getBoolean(IGNORE_INTERNAL_LINKS, true);
    if (job.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
        urlFilters = new URLFilters(job);
    }
    if (job.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
        urlNormalizers = new URLNormalizers(job, URLNormalizers.SCOPE_LINKDB);
    }
}
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
public void invert(Path linkDb, Path[] segments, boolean normalize, boolean filter, boolean force)
        throws IOException {
    JobConf job = ReLinkDb.createJob(getConf(), linkDb, normalize, filter);
    Path lock = new Path(linkDb, LOCK_NAME);
    FileSystem fs = FileSystem.get(getConf());
    try {
        LockUtil.createLockFile(fs, lock, force);
    } catch (Exception ex) {
        // lock creation failures are silently ignored
    }
    Path currentLinkDb = new Path(linkDb, CURRENT_NAME);
    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("ReLinkDb: starting at " + sdf.format(start));
        LOG.info("ReLinkDb: linkdb: " + linkDb);
        LOG.info("ReLinkDb: URL normalize: " + normalize);
        LOG.info("ReLinkDb: URL filter: " + filter);
        if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
            LOG.info("ReLinkDb: internal links will be ignored.");
        }
    }
    for (int i = 0; i < segments.length; i++) {
        if (LOG.isInfoEnabled()) {
            LOG.info("ReLinkDb: adding segment: " + segments[i]);
        }
        try {
            Path segmentPath = new Path(segments[i], ParseData.DIR_NAME);
            FileStatus[] files = fs.listStatus(segments[i], HadoopFSUtil.getPassDirectoriesFilter(fs));
            if (files.length != 6) {
                // incomplete segment: drop it instead of adding it as input
                fs.delete(segments[i], true);
            } else {
                FileInputFormat.addInputPath(job, segmentPath);
            }
        } catch (FileNotFoundException fex) {
            try {
                fs.delete(segments[i], true);
            } catch (Exception ex) {
            }
        }
    }
    try {
        JobClient.runJob(job);
    } catch (IOException e) {
        LockUtil.removeLockFile(fs, lock);
        throw e;
    }
    if (fs.exists(currentLinkDb)) {
        if (LOG.isInfoEnabled()) {
            LOG.info("ReLinkDb: merging with existing linkdb: " + linkDb);
        }
        // try to merge
        Path newLinkDb = FileOutputFormat.getOutputPath(job);
        job = LinkDbMerger.createMergeJob(getConf(), linkDb, normalize, filter);
        FileInputFormat.addInputPath(job, currentLinkDb);
        FileInputFormat.addInputPath(job, newLinkDb);
        try {
            JobClient.runJob(job);
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            fs.delete(newLinkDb, true);
            throw e;
        }
        fs.delete(newLinkDb, true);
    }
    LinkDb.install(job, linkDb);
    long end = System.currentTimeMillis();
    LOG.info("ReLinkDb: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}
From source file:net.peacesoft.nutch.crawl.ReSolrWriter.java
License:Apache License
void init(SolrServer server, JobConf job) throws IOException {
    solr = server;
    commitSize = job.getInt(SolrConstants.COMMIT_SIZE, 1000);
    solrMapping = SolrMappingReader.getInstance(job);
    delete = job.getBoolean(IndexerMapReduce.INDEXER_DELETE, false);
    // parse optional params
    params = new ModifiableSolrParams();
    String paramString = job.get(SolrConstants.PARAMS);
    if (paramString != null) {
        String[] values = paramString.split("&");
        for (String v : values) {
            String[] kv = v.split("=");
            if (kv.length < 2) {
                continue;
            }
            params.add(kv[0], kv[1]);
        }
    }
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelMap.java
License:Apache License
@Override
public void configure(JobConf conf) {
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    this.taskID = attempt.getTaskID().getId(); // todo verify
    this.newVerticesPerSlot = conf.getInt(ForestFireModelUtils.NEW_VERTICES_NR, -1);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.isFirst = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.isInit = this.isFirst;
    if (this.isInit)
        this.ambassadors = new HashMap<LongWritable, List<LongWritable>>();
    else
        this.ambassadors = ForestFireModelUtils
                .verticesIdsString2Map(conf.get(ForestFireModelUtils.CURRENT_AMBASSADORS));
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.DirectedForestFireModelReducer.java
License:Apache License
@Override
public void configure(JobConf conf) {
    this.isInit = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.pRatio = conf.getFloat(ForestFireModelUtils.P_RATIO, 0);
    this.rRatio = conf.getFloat(ForestFireModelUtils.R_RATIO, 0);
}
From source file:nl.tudelft.graphalytics.mapreducev2.evo.UndirectedForestFireModelMap.java
License:Apache License
@Override
public void configure(JobConf conf) {
    TaskAttemptID attempt = TaskAttemptID.forName(conf.get("mapred.task.id"));
    this.taskID = attempt.getTaskID().getId();
    this.newVerticesPerSlot = conf.getInt(ForestFireModelUtils.NEW_VERTICES_NR, -1);
    this.maxID = conf.getLong(ForestFireModelUtils.MAX_ID, -1);
    this.isFirst = conf.getBoolean(ForestFireModelUtils.IS_INIT, false);
    this.isInit = this.isFirst;
    if (this.isInit)
        this.ambassadors = new HashMap<LongWritable, List<LongWritable>>();
    else
        this.ambassadors = ForestFireModelUtils
                .verticesIdsString2Map(conf.get(ForestFireModelUtils.CURRENT_AMBASSADORS));
}