Example usage for org.apache.hadoop.mapred JobConf getLocalPath

Introduction

On this page you can find example usage for org.apache.hadoop.mapred JobConf getLocalPath.

Prototype

public Path getLocalPath(String pathString) throws IOException 

Document

Constructs a local file name. Files are distributed among configured local directories.
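
Before the full examples, here is a minimal sketch of the pattern most of them share: resolve a scratch location with getLocalPath, build output there, and promote it into the job's output directory via startLocalOutput/completeLocalOutput. The class, method, and path names below are illustrative only and are not taken from the projects quoted on this page.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobConf;

// Illustrative helper, not part of any of the projects below.
public class LocalOutputSketch {

    // Returns a local scratch path for building output named 'name' before it is
    // copied into the job's (already configured) output directory.
    public static Path openLocalScratch(JobConf job, String name) throws IOException {
        // Final destination inside the job's output directory.
        Path perm = new Path(FileOutputFormat.getOutputPath(job), name);

        // getLocalPath resolves the relative name against one of the configured
        // local directories, distributing files among them.
        Path temp = job.getLocalPath("scratch/" + name);

        FileSystem fs = FileSystem.get(job);
        fs.delete(perm, true); // delete old output, if any

        // Build the output under the returned local path; when finished, call
        // fs.completeLocalOutput(perm, temp) to copy it to 'perm'.
        return fs.startLocalOutput(perm, temp);
    }
}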

Usage

From source file: de.tudarmstadt.ukp.dkpro.bigdata.hadoop.UIMAMapReduceBase.java

License: Open Source License

@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);

        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }

        }
        replaceRecursively(engineDescription);
        this.engine = createEngine(engineDescription);

    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }

}

From source file: org.apache.nutch.crawl.Crawl.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.out.println("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
        return -1;
    }
    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;
    String solrUrl = null;

    for (int i = 0; i < args.length; i++) {
        if ("-dir".equals(args[i])) {
            dir = new Path(args[i + 1]);
            i++;
        } else if ("-threads".equals(args[i])) {
            threads = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-depth".equals(args[i])) {
            depth = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-topN".equals(args[i])) {
            topN = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-solr".equals(args[i])) {
            solrUrl = args[i + 1];
            i++;
        } else if (args[i] != null) {
            rootUrlDir = new Path(args[i]);
        }
    }

    JobConf job = new NutchJob(getConf());

    if (solrUrl == null) {
        LOG.warn("solrUrl is not set, indexing will be skipped...");
    }

    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("crawl started in: " + dir);
        LOG.info("rootUrlDir = " + rootUrlDir);
        LOG.info("threads = " + threads);
        LOG.info("depth = " + depth);
        LOG.info("solrUrl=" + solrUrl);
        if (topN != Long.MAX_VALUE)
            LOG.info("topN = " + topN);
    }

    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    Path indexes = new Path(dir + "/indexes");
    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());

    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) { // generate new segment
        Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
        if (segs == null) {
            LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
            break;
        }
        fetcher.fetch(segs[0], threads); // fetch it
        if (!Fetcher.isParsing(job)) {
            parseSegment.parse(segs[0]); // parse it, if needed
        }
        crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
    if (i > 0) {
        linkDbTool.invert(linkDb, segments, true, true, false); // invert links

        if (solrUrl != null) {
            // index, dedup & merge
            FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
            SolrIndexer indexer = new SolrIndexer(getConf());
            indexer.indexSolr(solrUrl, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
            SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
            dedup.setConf(getConf());
            dedup.dedup(solrUrl);
        }

    } else {
        LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("crawl finished: " + dir);
    }
    return 0;
}

From source file: org.apache.nutch.indexer.lucene.LuceneWriter.java

License: Apache License

public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));

    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    writer = new IndexWriter(FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
            new NutchDocumentAnalyzer(job), true, MaxFieldLength.UNLIMITED);

    writer.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    writer.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    writer.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    writer.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    writer.setMaxFieldLength(job.getInt("indexer.max.tokens", 10000));
    writer.setInfoStream(LogUtil.getDebugStream(Indexer.LOG));
    writer.setUseCompoundFile(false);
    writer.setSimilarity(new NutchSimilarity());

    processOptions(job);
}

From source file: org.apache.nutch.indexwriter.lucene.LuceneWriter.java

License: Apache License

public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2,
            new SmartChineseAnalyzer());
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));

    indexWriterConfig.setMergePolicy(mergePolicy);
    indexWriterConfig.setUseCompoundFile(false);
    indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    writer = new org.apache.lucene.index.IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), indexWriterConfig);

    /*
     * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job);
     */

    processOptions(job);
}

From source file: org.archive.jbs.lucene.LuceneOutputFormat.java

License: Apache License

public RecordWriter<Text, Text> getRecordWriter(final FileSystem fs, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    // Open Lucene index in ${temp}
    this.fs = FileSystem.get(job);
    this.job = job;
    this.perm = new Path(FileOutputFormat.getOutputPath(job), name);
    this.temp = job.getLocalPath("index/_" + (new Random().nextInt()));

    this.fs.delete(perm, true); // delete old, if any

    indexer = new IndexWriter(new NIOFSDirectory(new File(fs.startLocalOutput(perm, temp).toString())),
            new KeywordAnalyzer(), IndexWriter.MaxFieldLength.UNLIMITED);

    indexer.setMergeFactor(job.getInt("jbs.lucene.mergeFactor", 100));
    indexer.setMaxMergeDocs(job.getInt("jbs.lucene.maxMergeDocs", Integer.MAX_VALUE));
    indexer.setRAMBufferSizeMB(job.getInt("jbs.lucene.maxRAMBufferSize", 512));
    indexer.setTermIndexInterval(
            job.getInt("jbs.lucene.termIndexInterval", IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL));
    indexer.setMaxFieldLength(job.getInt("jbs.lucene.max.tokens", Integer.MAX_VALUE));
    indexer.setUseCompoundFile(false);
    indexer.setSimilarity(new WebSimilarity());

    LuceneDocumentWriter docWriter = buildDocumentWriter(job, indexer);

    return new LuceneRecordWriter(docWriter);
}

From source file: org.dkpro.bigdata.hadoop.UIMAMapReduceBase.java

License: Open Source License

@Override
public void configure(JobConf job) {
    try {
        this.job = job;
        this.inputName = job.get("mapred.input.dir");
        this.taskId = job.get("mapred.task.id");
        this.mapOutputValueClass = job.getMapOutputValueClass();
        this.outputValueClass = job.getOutputValueClass();
        this.samplingPropability = job.getInt("dkpro.map.samplingratio", 100);
        final EngineFactory engineFactory = (EngineFactory) Class
                .forName(job.get("dkpro.uima.factory", DkproHadoopDriver.class.getName())).newInstance();
        engineFactory.configure(job);

        final AnalysisEngineDescription engineDescription = getEngineDescription(engineFactory, job);

        // replace the $dir variable within the configuration.
        this.fs = FileSystem.get(job);
        this.localFS = FileSystem.getLocal(job);
        if (job.getBoolean("dkpro.output.onedirpertask", true)) {
            this.working_dir = new Path("uima_output_" + job.get("mapred.task.id"));
        } else {
            this.working_dir = new Path("uima_output");
        }
        final Path outputPath = FileOutputFormat.getOutputPath(job);
        this.results_dir = this.fs.startLocalOutput(outputPath, job.getLocalPath(this.working_dir.getName()));
        this.localFS.mkdirs(this.results_dir);
        final String[] resources = job.get("dkpro.resources", "").split(",");
        sLogger.info("Writing local data to: " + this.results_dir);
        this.resourceURIs = new TreeMap<String, URL>();
        for (final String resource : resources) {
            final URL r = job.getResource(resource);
            if (r != null && !resource.isEmpty()) {
                this.resourceURIs.put(resource, r);
            }

        }
        Map<String, String> variableValues = new HashMap<String, String>();
        variableValues.put("\\$dir", this.results_dir.toString());
        variableValues.put("\\$input", this.inputName);
        variableValues.put("\\$taskid", this.taskId);
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(job);
        if (cacheFiles != null) {
            for (Path cacheFile : cacheFiles) {
                variableValues.put("^\\$cache/" + cacheFile.getName(), cacheFile.toUri().getPath());
            }
        }
        for (final Entry<String, URL> resource : this.resourceURIs.entrySet()) {
            variableValues.put("\\$" + resource, resource.getValue().toString());
        }
        AnalysisEngineUtil.replaceVariables(engineDescription, variableValues);
        this.engine = createEngine(engineDescription);

    } catch (final Exception e) {
        sLogger.fatal("Error while configuring pipeline", e);
        e.printStackTrace();
        throw new RuntimeException(e);
    }

}

From source file: org.hbasene.index.create.mapred.IndexOutputFormat.java

License: Apache License

@Override
public RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper> getRecordWriter(final FileSystem fs,
        JobConf job, String name, final Progressable progress) throws IOException {

    final Path perm = new Path(FileOutputFormat.getOutputPath(job), name);
    final Path temp = job.getLocalPath("index/_" + Integer.toString(random.nextInt()));

    LOG.info("To index into " + perm);

    // delete old, if any
    fs.delete(perm, true);

    final IndexConfiguration indexConf = new IndexConfiguration();
    String content = job.get("hbase.index.conf");
    if (content != null) {
        indexConf.addFromXML(content);
    }

    String analyzerName = indexConf.getAnalyzerName();
    Analyzer analyzer;
    try {
        Class<? extends Analyzer> analyzerClass = Class.forName(analyzerName).asSubclass(Analyzer.class);
        Constructor<? extends Analyzer> analyzerCtor = analyzerClass.getConstructor(Version.class);

        analyzer = analyzerCtor.newInstance(Version.LUCENE_30);
    } catch (Exception e) {
        throw new IOException("Error in creating an analyzer object " + analyzerName);
    }

    // build locally first
    final IndexWriter writer = new IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), analyzer, true,
            IndexWriter.MaxFieldLength.LIMITED);

    // no delete, so no need for maxBufferedDeleteTerms
    writer.setMaxBufferedDocs(indexConf.getMaxBufferedDocs());
    writer.setMaxFieldLength(indexConf.getMaxFieldLength());
    writer.setMaxMergeDocs(indexConf.getMaxMergeDocs());
    writer.setMergeFactor(indexConf.getMergeFactor());
    String similarityName = indexConf.getSimilarityName();
    if (similarityName != null) {
        try {
            Class<? extends Similarity> similarityClass = Class.forName(similarityName)
                    .asSubclass(Similarity.class);
            Constructor<? extends Similarity> ctor = similarityClass.getConstructor(Version.class);
            Similarity similarity = ctor.newInstance(Version.LUCENE_30);
            writer.setSimilarity(similarity);
        } catch (Exception e) {
            throw new IOException("Error in creating a similarity object " + similarityName);
        }
    }
    writer.setUseCompoundFile(indexConf.isUseCompoundFile());

    return new RecordWriter<ImmutableBytesWritable, LuceneDocumentWrapper>() {
        AtomicBoolean closed = new AtomicBoolean(false);
        private long docCount = 0;

        public void write(ImmutableBytesWritable key, LuceneDocumentWrapper value) throws IOException {
            // unwrap and index doc
            Document doc = value.get();
            writer.addDocument(doc);
            docCount++;
            progress.progress();
        }

        public void close(final Reporter reporter) throws IOException {
            // spawn a thread to give progress heartbeats
            Thread prog = new Thread() {
                @Override
                public void run() {
                    while (!closed.get()) {
                        try {
                            reporter.setStatus("closing");
                            Thread.sleep(1000);
                        } catch (InterruptedException e) {
                            continue;
                        } catch (Throwable e) {
                            return;
                        }
                    }
                }
            };

            try {
                prog.start();

                // optimize index
                if (indexConf.doOptimize()) {
                    if (LOG.isInfoEnabled()) {
                        LOG.info("Optimizing index.");
                    }
                    writer.optimize();
                }

                // close index
                writer.close();
                if (LOG.isInfoEnabled()) {
                    LOG.info("Done indexing " + docCount + " docs.");
                }

                // copy to perm destination in dfs
                fs.completeLocalOutput(perm, temp);
                if (LOG.isInfoEnabled()) {
                    LOG.info("Copy done.");
                }
            } finally {
                closed.set(true);
            }
        }
    };
}

From source file: org.jahia.modules.crawl.CrawlJob.java

License: Open Source License

protected void executeInternal(JobExecutionContext context) throws JobExecutionException {
    try {
        JobDataMap mergedJobDataMap = context.getMergedJobDataMap();
        if (conf == null) {
            String baseDirPath = (String) mergedJobDataMap.get("baseDir");
            if (StringUtils.isEmpty(baseDirPath)) {
                baseDirPath = System.getProperty("user.dir");
            }
            String folderName = (String) mergedJobDataMap.get("folderName");
            if (folderName == null) {
                folderName = "jahia-crawler";
            }
            baseDir = new Path(
                    baseDirPath + (StringUtils.isEmpty(folderName) ? "" : System.getProperty("file.separator"))
                            + folderName);
            init();
        }

        List<String> urls = (List<String>) mergedJobDataMap.get("urls");

        JobConf job = new NutchJob(conf);

        Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());

        CrawlDBUtil.generateSeedList(fs, urlPath, urls);
        // inject
        Injector injector = new Injector(conf);
        injector.inject(crawldbPath, urlPath);

        // generate
        Generator g = new Generator(conf);
        // fetch
        conf.setBoolean("fetcher.parse", true);
        Fetcher fetcher = new Fetcher(conf);
        ParseSegment parseSegment = new ParseSegment(conf);
        CrawlDb crawlDbTool = new CrawlDb(conf);

        int depth = 5;
        int threads = 4;
        int i;
        for (i = 0; i < depth; i++) { // generate new segment
            Path generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE,
                    false, false);

            if (generatedSegment == null) {
                logger.info("Stopping at depth=" + i + " - no more URLs to fetch.");
                break;
            }
            fetcher.fetch(generatedSegment, threads, true);
            if (!Fetcher.isParsing(job)) {
                parseSegment.parse(generatedSegment); // parse it, if needed
            }
            crawlDbTool.update(crawldbPath, new Path[] { generatedSegment }, true, true);
        }
        if (i > 0) {
            LinkDb linkDbTool = new LinkDb(conf);
            Indexer indexer = new Indexer(conf);
            DeleteDuplicates dedup = new DeleteDuplicates(conf);
            IndexMerger merger = new IndexMerger(conf);

            linkDbTool.invert(linkDb, segments, true, true, false); // invert links

            if (indexes != null) {
                // Delete old indexes
                if (fs.exists(indexes)) {
                    logger.info("Deleting old indexes: " + indexes);
                    fs.delete(indexes, true);
                }

                // Delete old index
                if (fs.exists(index)) {
                    logger.info("Deleting old merged index: " + index);
                    fs.delete(index, true);
                }
            }

            // index, dedup & merge
            FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
            indexer.index(indexes, crawldbPath, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
            if (indexes != null) {
                dedup.dedup(new Path[] { indexes });
                fstats = fs.listStatus(indexes, HadoopFSUtil.getPassDirectoriesFilter(fs));
                merger.merge(HadoopFSUtil.getPaths(fstats), index, tmpDir);
            }
        } else {
            logger.warn("No URLs to fetch - check your seed list and URL filters.");
        }

    } catch (IOException e) {
        logger.error("Exception while crawling", e);
    }
}