List of usage examples for org.apache.hadoop.mapred JobConf setBoolean
public void setBoolean(String name, boolean value)
name
property to a boolean
. From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java
License:Apache License
@SuppressWarnings("deprecation") public int runTool() throws Exception { sLogger.info("PowerTool: GetWeightedTermDocVectors"); JobConf conf = new JobConf(BuildWeightedTermDocVectors.class); FileSystem fs = FileSystem.get(conf); String indexPath = getConf().get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String outputPath = env.getWeightedTermDocVectorsDirectory(); int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0); int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0); String collectionName = getConf().get("Ivory.CollectionName"); String termsFilePath = env.getIndexTermsData(); String termsIdsFilePath = env.getIndexTermIdsData(); String termIdMappingFilePath = env.getIndexTermIdMappingData(); String dfByTermFilePath = env.getDfByTermData(); Path inputPath = new Path(env.getTermDocVectorsDirectory()); Path weightedVectorsPath = new Path(outputPath); if (fs.exists(weightedVectorsPath)) { //fs.delete(weightedVectorsPath, true); sLogger.info("Output path already exists!"); return 0; }//from w ww . j av a2 s . co m /* add terms file to cache */ if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath)) || !fs.exists(new Path(termIdMappingFilePath))) { throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/" + termIdMappingFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(termsFilePath), conf); DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf); DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf); /* add df table to cache */ if (!fs.exists(new Path(dfByTermFilePath))) { throw new RuntimeException("Error, df data file " + dfByTermFilePath + "doesn't exist!"); } DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf); /* add dl table to cache */ Path docLengthFile = env.getDoclengthsData(); if (!fs.exists(docLengthFile)) { throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!"); } DistributedCache.addCacheFile(docLengthFile.toUri(), conf); conf.setMapperClass(MyMapper.class); //conf.setInt("mapred.task.timeout",3600000); conf.setJobName("GetWeightedTermDocVectors:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(0); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE)); conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false)); if (getConf().get("Ivory.ShortDocLengths") != null) { conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths")); } conf.set("Ivory.ScoringModel", getConf().get("Ivory.ScoringModel")); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, weightedVectorsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setMapOutputKeyClass(IntWritable.class); conf.setMapOutputValueClass(HMapSFW.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(HMapSFW.class); sLogger.info("Running job: " + conf.getJobName()); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); return 0; }
From source file:net.darkseraphim.webanalytics.hadoop.csv.CSVTextInputFormat.java
License:Apache License
@Override public RecordReader<LongWritable, List<Text>> getRecordReader(InputSplit split, JobConf conf, Reporter reporter) throws IOException { String quote = conf.get(CSVLineRecordReader.FORMAT_DELIMITER); String separator = conf.get(CSVLineRecordReader.FORMAT_SEPARATOR); conf.set(CSVLineRecordReader.FORMAT_DELIMITER, CSVLineRecordReader.DEFAULT_DELIMITER); conf.set(CSVLineRecordReader.FORMAT_SEPARATOR, CSVLineRecordReader.DEFAULT_SEPARATOR); conf.setBoolean(CSVLineRecordReader.IS_ZIPFILE, false); System.out.println("[LOG] Created reader"); if (split instanceof FileSplit) { return reader = new CSVLineRecordReader(split, conf); }/* w ww . j a va 2 s . c o m*/ throw new UnsupportedOperationException("Only FileSplits are supported"); }
From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("RaovatIndexer: starting at " + sdf.format(start)); final JobConf job = new NutchJob(getConf()); job.setJobName("index-solr " + solrUrl); LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone); LOG.info("RaovatIndexer: URL filtering: " + filter); LOG.info("RaovatIndexer: URL normalizing: " + normalize); ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job); job.set(ReSolrConstants.SERVER_URL, solrUrl); job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone); job.setBoolean(IndexerMapReduce.URL_FILTERING, filter); job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize); if (solrParams != null) { job.set(ReSolrConstants.PARAMS, solrParams); }/*from w w w.ja v a2s.c om*/ NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class); job.setReduceSpeculativeExecution(false); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); // do the commits once and for all the reducers in one go // SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job); // // if (!noCommit) { // solr.commit(); // } long end = System.currentTimeMillis(); LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } catch (Exception e) { LOG.error("RaovatIndexer:" + e.toString()); } finally { FileSystem.get(job).delete(tmp, true); } }
From source file:net.peacesoft.nutch.crawl.RaovatIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("RaovatIndexer: starting at " + sdf.format(start)); final JobConf job = new NutchJob(getConf()); job.setJobName("index-solr " + solrUrl); LOG.info("RaovatIndexer: deleting gone documents: " + deleteGone); LOG.info("RaovatIndexer: URL filtering: " + filter); LOG.info("RaovatIndexer: URL normalizing: " + normalize); IndexerMapReduce.initMRJob(crawlDb, linkDb, segments, job); job.set(ReSolrConstants.SERVER_URL, solrUrl); job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone); job.setBoolean(IndexerMapReduce.URL_FILTERING, filter); job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize); if (solrParams != null) { job.set(ReSolrConstants.PARAMS, solrParams); }// ww w .j av a 2 s . c om NutchIndexWriterFactory.addClassToConf(job, RaovatPoster.class); job.setReduceSpeculativeExecution(false); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); // do the commits once and for all the reducers in one go // SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job); // // if (!noCommit) { // solr.commit(); // } long end = System.currentTimeMillis(); LOG.info("RaovatIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } catch (Exception e) { LOG.error("RaovatIndexer: " + e.toString()); } finally { FileSystem.get(job).delete(tmp, true); } }
From source file:net.peacesoft.nutch.crawl.RaovatPostDeleteDuplicates.java
License:Apache License
public void dedup(String solrUrl, boolean noCommit) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("RaovatPostDeleteDuplicates: starting at " + sdf.format(start)); LOG.info("RaovatPostDeleteDuplicates: Solr url: " + solrUrl); JobConf job = new NutchJob(getConf()); job.set(ReSolrConstants.SERVER_URL, solrUrl); job.setBoolean("noCommit", noCommit); job.setInputFormat(RaovatPostDeleteDuplicates.SolrInputFormat.class); job.setOutputFormat(NullOutputFormat.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(RaovatPostDeleteDuplicates.SolrRecord.class); job.setMapperClass(IdentityMapper.class); job.setReducerClass(RaovatPostDeleteDuplicates.class); JobClient.runJob(job);/*from ww w . j a v a 2s. c om*/ long end = System.currentTimeMillis(); LOG.info("RaovatPostDeleteDuplicates: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public void update(Path crawlDb, Path[] segments, boolean normalize, boolean filter, boolean additionsAllowed, boolean force) throws IOException { try {/*ww w . ja va 2 s. co m*/ FileSystem fs = FileSystem.get(getConf()); Path lock = new Path(crawlDb, LOCK_NAME); try { LockUtil.createLockFile(fs, lock, force); } catch (Exception ex) { } SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); JobConf job = ReCrawlDb.createJob(getConf(), crawlDb); job.setBoolean(CRAWLDB_ADDITIONS_ALLOWED, additionsAllowed); job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); job.setBoolean(CrawlDbFilter.URL_NORMALIZING, normalize); boolean url404Purging = job.getBoolean(CRAWLDB_PURGE_404, false); if (LOG.isInfoEnabled()) { LOG.info("CrawlDb update: starting at " + sdf.format(start)); LOG.info("CrawlDb update: db: " + crawlDb); LOG.info("CrawlDb update: segments: " + Arrays.asList(segments)); LOG.info("CrawlDb update: additions allowed: " + additionsAllowed); LOG.info("CrawlDb update: URL normalizing: " + normalize); LOG.info("CrawlDb update: URL filtering: " + filter); LOG.info("CrawlDb update: 404 purging: " + url404Purging); } for (int i = 0; i < segments.length; i++) { Path fetch = new Path(segments[i], CrawlDatum.FETCH_DIR_NAME); Path parse = new Path(segments[i], CrawlDatum.PARSE_DIR_NAME); if (fs.exists(fetch) && fs.exists(parse)) { FileInputFormat.addInputPath(job, fetch); FileInputFormat.addInputPath(job, parse); } else { LOG.info(" - skipping invalid segment " + segments[i]); } } if (LOG.isInfoEnabled()) { LOG.info("ReCrawlDb update: Merging segment data into db."); } try { JobClient.runJob(job); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); Path outPath = FileOutputFormat.getOutputPath(job); if (fs.exists(outPath)) { fs.delete(outPath, true); } throw e; } ReCrawlDb.install(job, crawlDb); long end = System.currentTimeMillis(); LOG.info("CrawlDb update: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } catch (Exception ex) { LOG.error("ReCrawlDb update error: " + ex.toString(), ex); } }
From source file:net.peacesoft.nutch.crawl.ReCrawlDb.java
License:Apache License
public static JobConf createJob(Configuration config, Path crawlDb) throws IOException { Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("crawldb " + crawlDb); Path current = new Path(crawlDb, CURRENT_NAME); if (FileSystem.get(job).exists(current)) { FileInputFormat.addInputPath(job, current); }/*from w ww. j av a 2 s. com*/ job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDbFilter.class); job.setReducerClass(CrawlDbReducer.class); FileOutputFormat.setOutputPath(job, newCrawlDb); job.setOutputFormat(MapFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); // https://issues.apache.org/jira/browse/NUTCH-1110 job.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); return job; }
From source file:net.peacesoft.nutch.crawl.ReGenerator.java
License:Apache License
/** * Generate fetchlists in one or more segments. Whether to filter URLs or * not is read from the crawl.generate.filter property in the configuration * files. If the property is not found, the URLs are filtered. Same for the * normalisation.//from ww w .j a va2 s .c o m * * @param dbDir Crawl database directory * @param segments Segments directory * @param numLists Number of reduce tasks * @param topN Number of top URLs to be selected * @param curTime Current time in milliseconds * * @return Path to generated segment or null if no entries were selected * * @throws IOException When an I/O error occurs */ public Path[] generate(Path dbDir, Path segments, int numLists, long topN, long curTime, boolean filter, boolean norm, boolean force, int maxNumSegments) throws IOException { try { Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); Path lock = new Path(dbDir, CrawlDb.LOCK_NAME); FileSystem fs = FileSystem.get(getConf()); LockUtil.createLockFile(fs, lock, force); SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("ReGenerator: starting at " + sdf.format(start)); LOG.info("ReGenerator: Selecting best-scoring urls due for fetch."); LOG.info("ReGenerator: filtering: " + filter); LOG.info("ReGenerator: normalizing: " + norm); if (topN != Long.MAX_VALUE) { LOG.info("ReGenerator: topN: " + topN); } if ("true".equals(getConf().get(GENERATE_MAX_PER_HOST_BY_IP))) { LOG.info( "ReGenerator: GENERATE_MAX_PER_HOST_BY_IP will be ignored, use partition.url.mode instead"); } // map to inverted subset due for fetch, sort by score JobConf job = new NutchJob(getConf()); job.setJobName("generate: select from " + dbDir); if (numLists == -1) { // for politeness make numLists = job.getNumMapTasks(); // a partition per fetch task } if ("local".equals(job.get("mapred.job.tracker")) && numLists != 1) { // override LOG.info("ReGenerator: jobtracker is 'local', generating exactly one partition."); numLists = 1; } job.setLong(GENERATOR_CUR_TIME, curTime); // record real generation time long generateTime = System.currentTimeMillis(); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); job.setLong(GENERATOR_TOP_N, topN); job.setBoolean(GENERATOR_FILTER, filter); job.setBoolean(GENERATOR_NORMALISE, norm); job.setInt(GENERATOR_MAX_NUM_SEGMENTS, maxNumSegments); FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(Selector.class); job.setPartitionerClass(Selector.class); job.setReducerClass(Selector.class); FileOutputFormat.setOutputPath(job, tempDir); job.setOutputFormat(SequenceFileOutputFormat.class); job.setOutputKeyClass(FloatWritable.class); job.setOutputKeyComparatorClass(DecreasingFloatComparator.class); job.setOutputValueClass(SelectorEntry.class); job.setOutputFormat(GeneratorOutputFormat.class); try { JobClient.runJob(job); } catch (IOException e) { throw e; } // read the subdirectories generated in the temp // output and turn them into segments List<Path> generatedSegments = new ArrayList<Path>(); FileStatus[] status = fs.listStatus(tempDir); try { for (FileStatus stat : status) { Path subfetchlist = stat.getPath(); if (!subfetchlist.getName().startsWith("fetchlist-")) { continue; } // start a new partition job for this segment Path newSeg = partitionSegment(fs, segments, subfetchlist, numLists); generatedSegments.add(newSeg); } } catch (Exception e) { LOG.warn("ReGenerator: exception while partitioning segments, exiting ..."); fs.delete(tempDir, true); return null; } if (generatedSegments.size() == 0) { LOG.warn("ReGenerator: 0 records selected for fetching, exiting ..."); LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); return null; } if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) { // update the db from tempDir Path tempDir2 = new Path( getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis()); job = new NutchJob(getConf()); job.setJobName("generate: updatedb " + dbDir); job.setLong(Nutch.GENERATE_TIME_KEY, generateTime); for (Path segmpaths : generatedSegments) { Path subGenDir = new Path(segmpaths, CrawlDatum.GENERATE_DIR_NAME); FileInputFormat.addInputPath(job, subGenDir); } FileInputFormat.addInputPath(job, new Path(dbDir, CrawlDb.CURRENT_NAME)); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(CrawlDbUpdater.class); job.setReducerClass(CrawlDbUpdater.class); job.setOutputFormat(MapFileOutputFormat.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(CrawlDatum.class); FileOutputFormat.setOutputPath(job, tempDir2); try { JobClient.runJob(job); CrawlDb.install(job, dbDir); } catch (IOException e) { LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); fs.delete(tempDir2, true); throw e; } fs.delete(tempDir2, true); } LockUtil.removeLockFile(fs, lock); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info("ReGenerator: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); Path[] patharray = new Path[generatedSegments.size()]; return generatedSegments.toArray(patharray); } catch (Exception ex) { LOG.error("ReGenerator generate error: " + ex.toString(), ex); return null; } }
From source file:net.peacesoft.nutch.crawl.ReLinkDb.java
License:Apache License
private static JobConf createJob(Configuration config, Path linkDb, boolean normalize, boolean filter) { Path newLinkDb = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf job = new NutchJob(config); job.setJobName("linkdb " + linkDb); job.setInputFormat(SequenceFileInputFormat.class); job.setMapperClass(ReLinkDb.class); job.setCombinerClass(LinkDbMerger.class); // if we don't run the mergeJob, perform normalization/filtering now if (normalize || filter) { try {/*www.j a v a2s . co m*/ FileSystem fs = FileSystem.get(config); if (!fs.exists(linkDb)) { job.setBoolean(LinkDbFilter.URL_FILTERING, filter); job.setBoolean(LinkDbFilter.URL_NORMALIZING, normalize); } } catch (Exception e) { LOG.warn("ReLinkDb createJob: " + e); } } job.setReducerClass(LinkDbMerger.class); FileOutputFormat.setOutputPath(job, newLinkDb); job.setOutputFormat(MapFileOutputFormat.class); job.setBoolean("mapred.output.compress", true); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Inlinks.class); return job; }
From source file:net.peacesoft.nutch.crawl.ReSolrIndexer.java
License:Apache License
public void indexSolr(String solrUrl, Path inputDb, Path crawlDb, Path linkDb, List<Path> segments, boolean noCommit, boolean deleteGone, String solrParams, boolean filter, boolean normalize) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); LOG.info("SolrIndexer: starting at " + sdf.format(start)); final JobConf job = new NutchJob(getConf()); job.setJobName("index-solr " + solrUrl); LOG.info("SolrIndexer: deleting gone documents: " + deleteGone); LOG.info("SolrIndexer: URL filtering: " + filter); LOG.info("SolrIndexer: URL normalizing: " + normalize); ReIndexerMapReduce.initMRJob(inputDb, crawlDb, linkDb, segments, job); job.set(ReSolrConstants.SERVER_URL, solrUrl); job.setBoolean(IndexerMapReduce.INDEXER_DELETE, deleteGone); job.setBoolean(IndexerMapReduce.URL_FILTERING, filter); job.setBoolean(IndexerMapReduce.URL_NORMALIZING, normalize); if (solrParams != null) { job.set(ReSolrConstants.PARAMS, solrParams); }//from www. j av a 2 s. c o m NutchIndexWriterFactory.addClassToConf(job, ReSolrWriter.class); job.setReduceSpeculativeExecution(false); final Path tmp = new Path("tmp_" + System.currentTimeMillis() + "-" + new Random().nextInt()); FileOutputFormat.setOutputPath(job, tmp); try { JobClient.runJob(job); // do the commits once and for all the reducers in one go SolrServer solr = SolrUtils.getCommonsHttpSolrServer(job); if (!noCommit) { solr.commit(); } long end = System.currentTimeMillis(); LOG.info("SolrIndexer: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); } catch (Exception e) { LOG.error(e.toString()); } finally { FileSystem.get(job).delete(tmp, true); } }