List of usage examples for org.apache.hadoop.mapred JobConf getLocalPath
public Path getLocalPath(String pathString) throws IOException
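JobConf.getLocalPath(String pathString) constructs a local scratch path: it picks one of the directories configured in mapred.local.dir (chosen by the hash of pathString), creates the parent directory if necessary, and throws an IOException if no local directory is usable. The examples below all pair it with FileSystem.startLocalOutput to build output on local disk and, where shown, FileSystem.completeLocalOutput to publish the result. A minimal sketch of that pattern, assuming a default configuration; the class name and the output path are hypothetical:

    import java.io.IOException;

    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapred.JobConf;

    public class GetLocalPathExample { // hypothetical class name
        public static void main(String[] args) throws IOException {
            JobConf job = new JobConf();

            // Allocate a scratch path under one of the mapred.local.dir
            // directories; the parent directory is created if missing.
            Path temp = job.getLocalPath("example/scratch-" + System.nanoTime());

            FileSystem fs = FileSystem.get(job);
            Path perm = new Path("output/part-00000"); // hypothetical destination

            // Write locally first, then publish - the pattern used below.
            Path local = fs.startLocalOutput(perm, temp);
            // ... write data files under 'local' here ...
            fs.completeLocalOutput(perm, temp);
        }
    }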
From source file:de.tudarmstadt.ukp.dkpro.bigdata.hadoop.UIMAMapReduceBase.java
License:Open Source License
@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);
        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }
        }
        replaceRecursively(engineDescription);
        this.engine = createEngine(engineDescription);
    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.apache.nutch.crawl.Crawl.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.out.println("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
        return -1;
    }

    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;
    String solrUrl = null;

    for (int i = 0; i < args.length; i++) {
        if ("-dir".equals(args[i])) {
            dir = new Path(args[i + 1]);
            i++;
        } else if ("-threads".equals(args[i])) {
            threads = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-depth".equals(args[i])) {
            depth = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-topN".equals(args[i])) {
            topN = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-solr".equals(args[i])) {
            solrUrl = args[i + 1];
            i++;
        } else if (args[i] != null) {
            rootUrlDir = new Path(args[i]);
        }
    }

    JobConf job = new NutchJob(getConf());

    if (solrUrl == null) {
        LOG.warn("solrUrl is not set, indexing will be skipped...");
    }

    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("crawl started in: " + dir);
        LOG.info("rootUrlDir = " + rootUrlDir);
        LOG.info("threads = " + threads);
        LOG.info("depth = " + depth);
        LOG.info("solrUrl=" + solrUrl);
        if (topN != Long.MAX_VALUE)
            LOG.info("topN = " + topN);
    }

    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    Path indexes = new Path(dir + "/indexes");
    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());

    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) {
        // generate new segment
        Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
        if (segs == null) {
            LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
            break;
        }
        fetcher.fetch(segs[0], threads); // fetch it
        if (!Fetcher.isParsing(job)) {
            parseSegment.parse(segs[0]); // parse it, if needed
        }
        crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
    if (i > 0) {
        linkDbTool.invert(linkDb, segments, true, true, false); // invert links

        if (solrUrl != null) {
            // index, dedup & merge
            FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));

            SolrIndexer indexer = new SolrIndexer(getConf());
            indexer.indexSolr(solrUrl, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));

            SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
            dedup.setConf(getConf());
            dedup.dedup(solrUrl);
        }
    } else {
        LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("crawl finished: " + dir);
    }
    return 0;
}
From source file:org.apache.nutch.indexer.lucene.LuceneWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
            new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
    writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
    writer.setUseCompoundFile(false);
    writer.setSimilarity(new NutchSimilarity());

    processOptions(job);
}
From source file:org.apache.nutch.indexwriter.lucene.LuceneWriter.java
License:Apache License
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);

    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2,
            new SmartChineseAnalyzer());
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    indexWriterConfig.setMergePolicy(mergePolicy);
    indexWriterConfig.setUseCompoundFile(false);
    indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    writer = new org.apache.lucene.index.IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), indexWriterConfig);

    /*
     * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job);
     */

    processOptions(job);
}
From source file:org.archive.jbs.lucene.LuceneOutputFormat.java
License:Apache License
public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    // Open Lucene index in ${temp}
    this.fs = FileSystem.get(job);
    this.job = job;
    this.perm = new Path(FileOutputFormat.getOutputPath(job), name);
    this.temp = job.getLocalPath("index/_" + (new Random().nextInt()));

    this.fs.delete(perm, true); // delete old, if any

    indexer = new IndexWriter(new NIOFSDirectory(new File(fs.startLocalOutput(perm, temp).toString())),
            new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);
    indexer.setMergeFactor(job.getInt("jbs.lucene.mergeFactor", 100));
    indexer.setMaxMergeDocs(job.getInt("jbs.lucene.maxMergeDocs", Integer.MAX_VALUE));
    indexer.setRAMBufferSizeMB(job.getInt("jbs.lucene.maxRAMBufferSize", 512));
    indexer.setTermIndexInterval(
            job.getInt("jbs.lucene.termIndexInterval", IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL));
    indexer.setMaxFieldLength(job.getInt("jbs.lucene.max.tokens", Integer.MAX_VALUE));
    indexer.setUseCompoundFile(false);
    indexer.setSimilarity(new WebSimilarity());

    LuceneDocumentWriter docWriter = buildDocumentWriter(job, indexer);

    return new LuceneRecordWriter(docWriter);
}
From source file:org.dkpro.bigdata.hadoop.UIMAMapReduceBase.java
License:Open Source License
@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.inputName = job.get("mapred.input.dir");
        this.taskId = job.get("mapred.task.id");
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);
        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        if (job.getBoolean("dkpro.output.onedirpertask", true)) {
            this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        } else {
            this.working_dir = new Path("uima_output");
        }
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }
        }
        Map<String, String> variableValues = new HashMap<String, String>();
        variableValues.put("\\$dir", this.results_dir.toString());
        variableValues.put("\\$input", this.inputName);
        variableValues.put("\\$taskid", this.taskId);
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (cacheFiles != null) {
            for (Path cacheFile : cacheFiles) {
                variableValues.put("^\\$cache/" + cacheFile.getName(), cacheFile.toUri().getPath());
            }
        }
        for (final Entry<String, URL> resource : this.resourceURIs.entrySet()) {
            // use the entry's key; concatenating the Entry itself would yield "key=value"
            variableValues.put("\\$" + resource.getKey(), resource.getValue().toString());
        }
        AnalysisEngineUtil.replaceVariables(engineDescription, variableValues);
        this.engine = createEngine(engineDescription);
    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }
}
From source file:org.hbasene.index.create.mapred.IndexOutputFormat.java
License:Apache License
@Override
public RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs,
        JobConf job, String name, final Progressable progress) throws IOException {

    final Path perm = new Path(FileOutputFormat.getOutputPath(job), name);
    final Path temp = job.getLocalPath("index/_" + Integer.toString(random.nextInt()));

    LOG.info("To index into " + perm);

    // delete old, if any
    fs.delete(perm, true);

    final IndexConfiguration indexConf = new IndexConfiguration();
    String content = job.get("hbase.index.conf");
    if (content != null) {
        indexConf.addFromXML(content);
    }

    String analyzerName = indexConf.getAnalyzerName();
    Analyzer analyzer;
    try {
        Class<? extends Analyzer> analyzerClass = Class.forName(analyzerName).asSubclass(Analyzer.class);
        Constructor<? extends Analyzer> analyzerCtor = analyzerClass.getConstructor(Version.class);
        analyzer = analyzerCtor.newInstance(Version.LUCENE_30);
    } catch (Exception e) {
        throw new IOException("Error in creating an analyzer object " + analyzerName);
    }

    // build locally first
    final IndexWriter writer = new IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), analyzer, true,
            IndexWriter.MaxFieldLength.LIMITED);

    // no delete, so no need for maxBufferedDeleteTerms
    writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
    writer.setMaxFieldLength(indexConf.getMaxFieldLength());
    writer.setMaxMergeDocs(indexConf.getMaxMergeDocs());
    writer.setMergeFactor(indexConf.getMergeFactor());
    String similarityName = indexConf.getSimilarityName();
    if (similarityName != null) {
        try {
            Class<? extends Similarity> similarityClass = Class.forName(similarityName)
                    .asSubclass(Similarity.class);
            Constructor<? extends Similarity> ctor = similarityClass.getConstructor(Version.class);
            Similarity similarity = ctor.newInstance(Version.LUCENE_30);
            writer.setSimilarity(similarity);
        } catch (Exception e) {
            throw new IOException("Error in creating a similarity object " + similarityName);
        }
    }
    writer.setUseCompoundFile(indexConf.isUseCompoundFile());

    return new RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper>() {
        AtomicBoolean closed = new AtomicBoolean(false);
        private long docCount = 0;

        public void write(ImmutableBytesWritable key, LuceneDocumentWrapper value) throws IOException {
            // unwrap and index doc
            Document doc = value.get();
            writer.addDocument(doc);
            docCount++;
            progress.progress();
        }

        public void close(final Reporter reporter) throws IOException {
            // spawn a thread to give progress heartbeats
            Thread prog = new Thread() {
                @Override
                public void run() {
                    while (!closed.get()) {
                        try {
                            reporter.setStatus("closing");
                            Thread.sleep(1000);
                        } catch (InterruptedException e) {
                            continue;
                        } catch (Throwable e) {
                            return;
                        }
                    }
                }
            };

            try {
                prog.start();

                // optimize index
                if (indexConf.doOptimize()) {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("Optimizing index.");
                    }
                    writer.optimize();
                }

                // close index
                writer.close();
                if (LOG.isInfoEnabled()) {
                    LOG.info("Done indexing " + docCount + " docs.");
                }

                // copy to perm destination in dfs
                fs.completeLocalOutput(perm, temp);
                if (LOG.isInfoEnabled()) {
                    LOG.info("Copy done.");
                }
            } finally {
                closed.set(true);
            }
        }
    };
}
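Note that this example also shows the other half of the pattern: once the writer is closed, close() calls fs.completeLocalOutput(perm, temp) to copy the locally built index from the getLocalPath scratch directory to its permanent location on the job's file system.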
From source file:org.jahia.modules.crawl.CrawlJob.java
License:Open Source License
protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
    try {
        JobDataMap mergedJobDataMap = context.getMergedJobDataMap();
        if (conf == null) {
            String baseDirPath = (String) mergedJobDataMap.get("baseDir");
            if (StringUtils.isEmpty(baseDirPath)) {
                baseDirPath = System.getProperty("user.dir");
            }
            String folderName = (String) mergedJobDataMap.get("folderName");
            if (folderName == null) {
                folderName = "jahia-crawler";
            }
            baseDir = new Path(baseDirPath
                    + (StringUtils.isEmpty(folderName) ? "" : System.getProperty("file.separator"))
                    + folderName);
            init();
        }
        List<String> urls = (List<String>) mergedJobDataMap.get("urls");

        JobConf job = new NutchJob(conf);
        Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());

        CrawlDBUtil.generateSeedList(fs, urlPath, urls);

        // inject
        Injector injector = new Injector(conf);
        injector.inject(crawldbPath, urlPath);

        // generate
        Generator g = new Generator(conf);

        // fetch
        conf.setBoolean("fetcher.parse", true);
        Fetcher fetcher = new Fetcher(conf);
        ParseSegment parseSegment = new ParseSegment(conf);
        CrawlDb crawlDbTool = new CrawlDb(conf);
        int depth = 5;
        int threads = 4;
        int i;
        for (i = 0; i < depth; i++) {
            // generate new segment
            Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE,
                    false, false);
            if (generatedSegment == null) {
                logger.info("Stopping at depth=" + i + " - no more URLs to fetch.");
                break;
            }
            fetcher.fetch(generatedSegment, threads, true);
            if (!Fetcher.isParsing(job)) {
                parseSegment.parse(generatedSegment); // parse it, if needed
            }
            crawlDbTool.update(crawldbPath, new Path[] { generatedSegment }, true, true);
        }
        if (i > 0) {
            LinkDb linkDbTool = new LinkDb(conf);
            Indexer indexer = new Indexer(conf);
            DeleteDuplicates dedup = new DeleteDuplicates(conf);
            IndexMerger merger = new IndexMerger(conf);

            linkDbTool.invert(linkDb, segments, true, true, false); // invert links

            if (indexes != null) {
                // Delete old indexes
                if (fs.exists(indexes)) {
                    logger.info("Deleting old indexes: " + indexes);
                    fs.delete(indexes, true);
                }

                // Delete old index
                if (fs.exists(index)) {
                    logger.info("Deleting old merged index: " + index);
                    fs.delete(index, true);
                }
            }

            // index, dedup & merge
            FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
            indexer.index(indexes, crawldbPath, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
            if (indexes != null) {
                dedup.dedup(new Path[] { indexes });
                fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
                merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
            }
        } else {
            logger.warn("No URLs to fetch - check your seed list and URL filters.");
        }
    } catch (IOException e) {
        logger.error("Exception while crawling", e);
    }
}