List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
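Before the full examples, here is a minimal, self-contained sketch of the typical pattern (the path and class name below are hypothetical, chosen only for illustration): toUri() converts a Path into a java.net.URI, which is commonly passed to FileSystem.get(...) to resolve the owning file system, or queried with getPath() to obtain the plain path string.

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToUriSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical local path used only to illustrate the API.
        Path path = new Path("file:///tmp/demo/input.txt");

        // toUri() exposes the scheme, authority, and path components of the Path.
        URI uri = path.toUri();
        System.out.println("scheme = " + uri.getScheme()); // file
        System.out.println("path   = " + uri.getPath());   // /tmp/demo/input.txt

        // A common pattern in the examples below: resolve the FileSystem
        // that owns this path from its URI.
        FileSystem fs = FileSystem.get(uri, new Configuration());
        System.out.println("fs = " + fs.getUri());
    }
}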
From source file: backup.namenode.NameNodeBackupBlockCheckProcessor.java
License: Apache License

private void addExtendedBlocksFromNameNode(BackupReportWriter writer,
        ExternalExtendedBlockSort<Addresses> nameNodeBlocks, DFSClient client, FileStatus fs,
        Set<Path> pathSetToIgnore) throws IOException {
    Path qualifiedPath = fileSystem.makeQualified(fs.getPath());
    if (shouldIgnore(pathSetToIgnore, qualifiedPath)) {
        return;
    }
    String src = qualifiedPath.toUri().getPath();
    long start = 0;
    long length = fs.getLen();
    LocatedBlocks locatedBlocks = client.getLocatedBlocks(src, start, length);
    for (LocatedBlock locatedBlock : locatedBlocks.getLocatedBlocks()) {
        DatanodeInfo[] locations = locatedBlock.getLocations();
        ExtendedBlock extendedBlock = BackupUtil.fromHadoop(locatedBlock.getBlock());
        Addresses addresses = new Addresses(locations);
        nameNodeBlocks.add(extendedBlock, addresses);
        writer.statusExtendedBlocksFromNameNode(src, extendedBlock, locations);
    }
}
From source file: be.ugent.intec.halvade.uploader.input.BaseFileReader.java

protected static BufferedReader getReader(boolean readFromDistributedStorage, String file)
        throws FileNotFoundException, IOException {
    InputStream hdfsIn;
    if (readFromDistributedStorage) {
        Path pt = new Path(file);
        FileSystem fs = FileSystem.get(pt.toUri(), new Configuration());
        hdfsIn = fs.open(pt);
        // read the stream in the correct format!
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(hdfsIn, BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(hdfsIn);
            return new BufferedReader(new InputStreamReader(bzip2));
        } else
            return new BufferedReader(new InputStreamReader(hdfsIn));
    } else {
        if (file.endsWith(".gz")) {
            GZIPInputStream gzip = new GZIPInputStream(new FileInputStream(file), BUFFERSIZE);
            return new BufferedReader(new InputStreamReader(gzip));
        } else if (file.endsWith(".bz2")) {
            CBZip2InputStream bzip2 = new CBZip2InputStream(new FileInputStream(file));
            return new BufferedReader(new InputStreamReader(bzip2));
        } else if (file.equals("-")) {
            return new BufferedReader(new InputStreamReader(System.in));
        } else
            return new BufferedReader(new FileReader(file));
    }
}
From source file: be_uclouvain_ingi2145_lab05.GiraphJobRunner.java

@Override
public int run(String[] strings) throws Exception {
    GiraphConfiguration gconf = new GiraphConfiguration(conf);
    //gconf.setVertexClass(SimpleShortestPathsComputation.class);
    /*gconf.setVertexInputFormatClass(SimpleShortestPathsVertexInputFormat.class);
    gconf.setVertexOutputFormatClass(SimpleShortestPathsVertexOutputFormat.class);*/

    CommandLine cmd = ConfigurationUtils.parseArgs(gconf, strings);
    if (null == cmd) {
        return 0;
    }

    //GiraphYarnClient job = new GiraphYarnClient(gconf, gconf.getClass().getName());
    GiraphJob job = new GiraphJob(gconf, getClass().getName());
    job.getInternalJob().setJarByClass(getClass());

    if (cmd.hasOption("vof") || cmd.hasOption("eof")) {
        if (cmd.hasOption("op")) {
            Path outputPath = new Path(cmd.getOptionValue("op"));
            FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
            // Check whether the output path exists.
            if (fs.exists(outputPath)) {
                // If it exists, delete it.
                fs.delete(outputPath, true);
            }
            FileOutputFormat.setOutputPath(job.getInternalJob(), outputPath);
        }
    }

    /*
    if (cmd.hasOption("vif") || cmd.hasOption("eif")) {
        if (cmd.hasOption("vip")) {
            FileInputFormat.addInputPath(job.getInternalJob(), new Path(cmd.getOptionValue("op")));
        }
    }
    */

    // If there is a custom option specified
    if (cmd.hasOption("ca")) {
        String[] args = cmd.getOptionValues("ca");
        LOG.fatal("" + Arrays.toString(args));
        gconf.set("ca", args[0].split("=")[1]);
        LOG.fatal("" + gconf.get("ca"));
        gconf.setWorkerConfiguration(Integer.parseInt(cmd.getOptionValue("w")),
                Integer.parseInt(cmd.getOptionValue("w")), 100.0f);
    }

    /*
    if (cmd.hasOption("cf")) {
        DistributedCache.addCacheFile(new URI(cmd.getOptionValue("cf")), job.getConfiguration());
    }
    */

    return job.run(true) ? 0 : -1;
}
From source file: bixo.examples.crawl.DemoCrawlTool.java
License: Apache License

public static void importOneDomain(String targetDomain, Path crawlDbPath, JobConf conf) throws Exception {
    try {
        Tap urlSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString(), true);
        TupleEntryCollector writer = urlSink.openForWrite(conf);
        SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
        CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize("http://" + targetDomain), 0, 0,
                UrlStatus.UNFETCHED, 0);
        writer.add(datum.getTuple());
        writer.close();
    } catch (Exception e) {
        HadoopUtils.safeRemove(crawlDbPath.getFileSystem(conf), crawlDbPath);
        throw e;
    }
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

private static void processStatus(JobConf conf, Path curDirPath) throws IOException {
    Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString());
    TupleEntryIterator iter = statusTap.openForRead(conf);

    LOGGER.info("Analyzing: " + CrawlConfig.STATUS_SUBDIR_NAME);

    UrlStatus[] statusValues = UrlStatus.values();
    int[] statusCounts = new int[statusValues.length];
    int totalEntries = 0;
    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        String statusLine = entry.getString("line");
        String[] pieces = statusLine.split("\t");
        int pos = StatusDatum.FIELDS.getPos(StatusDatum.STATUS_FN);
        UrlStatus status = UrlStatus.valueOf(pieces[pos]);
        statusCounts[status.ordinal()] += 1;
    }

    for (int i = 0; i < statusCounts.length; i++) {
        if (statusCounts[i] != 0) {
            LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i]));
        }
    }
    LOGGER.info("Total status: " + totalEntries);
    LOGGER.info("");
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

private static void processCrawlDb(JobConf conf, Path curDirPath, boolean exportDb) throws IOException {
    TupleEntryIterator iter;
    int totalEntries;
    Path crawlDbPath = new Path(curDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString());
    iter = crawldbTap.openForRead(conf);
    totalEntries = 0;
    int fetchedUrls = 0;
    int unfetchedUrls = 0;

    LOGGER.info("Analyzing: " + CrawlConfig.CRAWLDB_SUBDIR_NAME);

    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        CrawlDbDatum datum = new CrawlDbDatum(entry);
        if (exportDb) {
            LOGGER.info(datum.toString());
        }
        if (datum.getLastFetched() == 0) {
            unfetchedUrls += 1;
        } else {
            fetchedUrls += 1;
        }
    }

    if (!exportDb) {
        LOGGER.info(String.format("%d fetched URLs", fetchedUrls));
        LOGGER.info(String.format("%d unfetched URLs", unfetchedUrls));
        LOGGER.info("Total URLs: " + totalEntries);
        LOGGER.info("");
    }
}
From source file: bixo.examples.crawl.DemoStatusTool.java
License: Apache License

public static void main(String[] args) {
    DemoStatusToolOptions options = new DemoStatusToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    String crawlDirName = options.getWorkingDir();

    try {
        JobConf conf = new JobConf();
        Path crawlDirPath = new Path(crawlDirName);
        FileSystem fs = crawlDirPath.getFileSystem(conf);

        if (!fs.exists(crawlDirPath)) {
            System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
            System.exit(-1);
        }

        // Skip Hadoop/Cascading DEBUG messages.
        Logger.getRootLogger().setLevel(Level.INFO);

        boolean exportDb = options.isExportDb();
        if (exportDb) {
            Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath);
            processCrawlDb(conf, latestCrawlDirPath, exportDb);
        } else {
            int prevLoop = -1;
            Path curDirPath = null;
            while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) {
                String curDirName = curDirPath.toUri().toString();
                LOGGER.info("");
                LOGGER.info("================================================================");
                LOGGER.info("Processing " + curDirName);
                LOGGER.info("================================================================");

                int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath);
                if (curLoop != prevLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop));
                }
                prevLoop = curLoop;

                // Process the status and crawldb in curPath
                processStatus(conf, curDirPath);
                processCrawlDb(conf, curDirPath, exportDb);
            }
        }
    } catch (Throwable t) {
        LOGGER.error("Exception running tool", t);
        System.exit(-1);
    }
}
From source file: bixo.examples.crawl.SimpleCrawlTool.java
License: Apache License

public static void main(String[] args) {
    SimpleCrawlToolOptions options = new SimpleCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /urls subdir.
            // In the /urls dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.toUri().toString();
            setLoopLoggerFile(curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy.
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want to,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        if (domain != null) {
            urlFilter = new DomainUrlFilter(domain);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.toUri().toString();
            setLoopLoggerFile(curLoopDirName, curLoop);

            Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            // flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db.
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file: bixo.examples.crawl.SimpleStatusTool.java
License: Apache License

private static void processStatus(JobConf conf, Path curDirPath) throws IOException {
    Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString());
    TupleEntryIterator iter = statusTap.openForRead(conf);

    UrlStatus[] statusValues = UrlStatus.values();
    int[] statusCounts = new int[statusValues.length];
    int totalEntries = 0;
    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        // The status line fields are URL_FN, STATUS_FN, HEADERS_FN, EXCEPTION_FN,
        // STATUS_TIME_FN, HOST_ADDRESS_FN, plus the super-fields of StatusDatum.
        String statusLine = entry.getString("line");
        String[] pieces = statusLine.split("\t");
        UrlStatus status = UrlStatus.valueOf(pieces[1]);
        statusCounts[status.ordinal()] += 1;
    }

    for (int i = 0; i < statusCounts.length; i++) {
        if (statusCounts[i] != 0) {
            LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i]));
        }
    }
    LOGGER.info("Total status: " + totalEntries);
    LOGGER.info("");
}
From source file: bixo.examples.crawl.SimpleStatusTool.java
License: Apache License

private static void processCrawlDb(JobConf conf, Path curDirPath, boolean exportDb) throws IOException {
    TupleEntryIterator iter;
    int totalEntries;
    Path crawlDbPath = new Path(curDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawldbTap = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toUri().toString());
    iter = crawldbTap.openForRead(conf);
    totalEntries = 0;
    int fetchedUrls = 0;
    int unfetchedUrls = 0;

    while (iter.hasNext()) {
        TupleEntry entry = iter.next();
        totalEntries += 1;

        CrawlDbDatum datum = new CrawlDbDatum(entry);
        if (exportDb) {
            LOGGER.info(datum.toString());
        }
        if (datum.getLastFetched() == 0) {
            unfetchedUrls += 1;
        } else {
            fetchedUrls += 1;
        }
    }

    if (!exportDb) {
        LOGGER.info(String.format("%d fetched URLs", fetchedUrls));
        LOGGER.info(String.format("%d unfetched URLs", unfetchedUrls));
        LOGGER.info("Total URLs: " + totalEntries);
        LOGGER.info("");
    }
}