List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
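Before the full examples, here is a minimal, self-contained sketch of the typical pattern (the path and class name below are hypothetical, chosen only for illustration): toUri() converts a Path into a java.net.URI, which is commonly passed to FileSystem.get(...) to resolve the owning file system, or queried with getPath() to obtain the plain path string.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToUriSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical local path used only to illustrate the API.
        Path path = new Path("file:///tmp/demo/input.txt");

        // toUri() exposes the scheme, authority, and path components of the Path.
        URI uri = path.toUri();
        System.out.println("scheme = " + uri.getScheme()); // file
        System.out.println("path   = " + uri.getPath());   // /tmp/demo/input.txt

        // A common pattern in the examples below: resolve the FileSystem
        // that owns this path from its URI.
        FileSystem fs = FileSystem.get(uri, new Configuration());
        System.out.println("fs = " + fs.getUri());
    }
}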
From source file: backup.namenode.NameNodeBackupBlockCheckProcessor.java
License: Apache License

private void addExtendedBlocksFromNameNode(BackupReportWriter writer,
        ExternalExtendedBlockSort<Addresses> nameNodeBlocks, DFSClient client, FileStatus fs,
        Set<Path> pathSetToIgnore) throws IOException {
    Path qualifiedPath = fileSystem.makeQualified(fs.getPath());
    if (shouldIgnore(pathSetToIgnore, qualifiedPath)) {
        return;
    }
    String src = qualifiedPath.toUri().getPath();
    long start = 0;
    long length = fs.getLen();
    LocatedBlocks locatedBlocks = client.getLocatedBlocks(src, start, length);
    for (LocatedBlock locatedBlock : locatedBlocks.getLocatedBlocks()) {
        DatanodeInfo[] locations = locatedBlock.getLocations();
        ExtendedBlock extendedBlock = BackupUtil.fromHadoop(locatedBlock.getBlock());
        Addresses addresses = new Addresses(locations);
        nameNodeBlocks.add(extendedBlock, addresses);
        writer.statusExtendedBlocksFromNameNode(src, extendedBlock, locations);
    }
}
From source file: be.ugent.intec.halvade.uploader.input.BaseFileReader.java

protected static BufferedReader getReader(boolean readFromDistributedStorage, String file)
        throws FileNotFoundException, IOException {
    InputStream hdfsIn;
    if (readFromDistributedStorage) {
        Path pt = new Path(file);
        FileSystem fs = FileSystem.get(pt.toUri(), new Configuration());
        hdfsIn = fs.open(pt);
        // read the stream in the correct format!
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(hdfsIn, BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(hdfsIn);
            return new BufferedReader(new InputStreamReader(bzip2));
        } else
            return new BufferedReader(new InputStreamReader(hdfsIn));
    } else {
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(new FileInputStream(file), BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(new FileInputStream(file));
            return new BufferedReader(new InputStreamReader(bzip2));
        } else if (file.equals("-")) {
            return new BufferedReader(new InputStreamReader(System.in));
        } else
            return new BufferedReader(new FileReader(file));
    }
}
From source file: be_uclouvain_ingi2145_lab05.GiraphJobRunner.java

@Override
public int run(String[] strings) throws Exception {
    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    //gconf.setVertexClass(SimpleShortestPathsComputation.class);
    /*gconf.setVertexInputFormatClass(SimpleShortestPathsVertexInputFormat.class);
    gconf.setVertexOutputFormatClass(SimpleShortestPathsVertexOutputFormat.class);*/

    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    //GiraphYarnClient job = new GiraphYarnClient(gconf, gconf.getClass().getName());
    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());

    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));
            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            // Check whether the output path exists.
            if (fs.exists(outputPath)) {
                // If it exists, delete it.
                fs.delete(outputPath, true);
            }
            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }

    /*
    if (cmd.hasOption("vif") || cmd.hasOption("eif")) {
        if (cmd.hasOption("vip")) {
            FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op")));
        }
    }
    */

    // If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        LOG.fatal("" + Arrays.toString(args));
        gconf.set("ca", args[0].split("=")[1]);
        LOG.fatal("" + gconf.get("ca"));
        gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")),
                Integer.parseInt(cmd.getOptionValue("w")), 100.0f);
    }

    /*
    if (cmd.hasOption("cf")) {
        DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")), job.getConfiguration());
    }
    */

    return job.run(true) ? 0 : -1;
}
From source file: bixo.examples.crawl.DemoCrawlTool.java
License: Apache License

public static void importOneDomain(String targetDomain, Path crawlDbPath, JobConf conf) throws Exception {
    try {
        Tap urlSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString(), true);
        TupleEntryCollector writer = urlSink.openForWrite(conf);
        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
        CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize("http://" + targetDomain), 0, 0,
                UrlStatus.UNFETCHED, 0);
        writer.add(datum.getTuple());
        writer.close();
    } catch (Exception e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(conf), crawlDbPath);
        throw e;
    }
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

private static void processStatus(JobConf conf, Path curDirPath) throws IOException {
    Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString());
    TupleEntryIterator iter = statusTap.openForRead(conf);

    LOGGER.info("Analyzing: " + CrawlConfig.STATUS_SUBDIR_NAME);

    UrlStatus[] statusValues = UrlStatus.values();
    int[] statusCounts = new int[statusValues.length];
    int totalEntries = 0;
    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        String statusLine = entry.getString("line");
        String[] pieces = statusLine.split("\t");
        int pos = StatusDatum.FIELDS.getPos(StatusDatum.STATUS_FN);
        UrlStatus status = UrlStatus.valueOf(pieces[pos]);
        statusCounts[status.ordinal()] += 1;
    }

    for (int i = 0; i < statusCounts.length; i++) {
        if (statusCounts[i] != 0) {
            LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i]));
        }
    }
    LOGGER.info("Total status: " + totalEntries);
    LOGGER.info("");
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

private static void processCrawlDb(JobConf conf, Path curDirPath, boolean exportDb) throws IOException {
    TupleEntryIterator iter;
    int totalEntries;
    Path crawlDbPath = new Path(curDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString());
    iter = crawldbTap.openForRead(conf);
    totalEntries = 0;
    int fetchedUrls = 0;
    int unfetchedUrls = 0;

    LOGGER.info("Analyzing: " + CrawlConfig.CRAWLDB_SUBDIR_NAME);

    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        CrawlDbDatum datum = new CrawlDbDatum(entry);
        if (exportDb) {
            LOGGER.info(datum.toString());
        }
        if (datum.getLastFetched() == 0) {
            unfetchedUrls += 1;
        } else {
            fetchedUrls += 1;
        }
    }

    if (!exportDb) {
        LOGGER.info(String.format("%d fetched URLs", fetchedUrls));
        LOGGER.info(String.format("%d unfetched URLs", unfetchedUrls));
        LOGGER.info("Total URLs: " + totalEntries);
        LOGGER.info("");
    }
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }
                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);
            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}
From source file: bixo.examples.crawl.SimpleCrawlTool.java
License: Apache License

public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir.
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: bixo.examples.crawl.SimpleStatusTool.java
License: Apache License

private static void processStatus(JobConf conf, Path curDirPath) throws IOException {
    Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString());
    TupleEntryIterator iter = statusTap.openForRead(conf);

    UrlStatus[] statusValues = UrlStatus.values();
    int[] statusCounts = new int[statusValues.length];
    int totalEntries = 0;
    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        // The status line fields are URL_FN, STATUS_FN, HEADERS_FN, EXCEPTION_FN,
        // STATUS_TIME_FN, HOST_ADDRESS_FN, plus the super-fields of StatusDatum.
        String statusLine = entry.getString("line");
        String[] pieces = statusLine.split("\t");
        UrlStatus status = UrlStatus.valueOf(pieces[1]);
        statusCounts[status.ordinal()] += 1;
    }

    for (int i = 0; i < statusCounts.length; i++) {
        if (statusCounts[i] != 0) {
            LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i]));
        }
    }
    LOGGER.info("Total status: " + totalEntries);
    LOGGER.info("");
}
From source file: bixo.examples.crawl.SimpleStatusTool.java
License: Apache License

private static void processCrawlDb(JobConf conf, Path curDirPath, boolean exportDb) throws IOException {
    TupleEntryIterator iter;
    int totalEntries;
    Path crawlDbPath = new Path(curDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString());
    iter = crawldbTap.openForRead(conf);
    totalEntries = 0;
    int fetchedUrls = 0;
    int unfetchedUrls = 0;

    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        CrawlDbDatum datum = new CrawlDbDatum(entry);
        if (exportDb) {
            LOGGER.info(datum.toString());
        }
        if (datum.getLastFetched() == 0) {
            unfetchedUrls += 1;
        } else {
            fetchedUrls += 1;
        }
    }

    if (!exportDb) {
        LOGGER.info(String.format("%d fetched URLs", fetchedUrls));
        LOGGER.info(String.format("%d unfetched URLs", unfetchedUrls));
        LOGGER.info("Total URLs: " + totalEntries);
        LOGGER.info("");
    }
}