List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
From source file:bixo.examples.crawl.SimpleStatusTool.java
License:Apache License
public static void main(String[] args) { SimpleStatusToolOptions options = new SimpleStatusToolOptions(); CmdLineParser parser = new CmdLineParser(options); try {// ww w .j a v a 2 s. co m parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); printUsageAndExit(parser); } String crawlDirName = options.getCrawlDir(); try { JobConf conf = new JobConf(); Path crawlDirPath = new Path(crawlDirName); FileSystem fs = crawlDirPath.getFileSystem(conf); if (!fs.exists(crawlDirPath)) { System.err.println("Prior crawl output directory does not exist: " + crawlDirName); System.exit(-1); } // Skip Hadoop/Cascading DEBUG messages. Logger.getRootLogger().setLevel(Level.INFO); boolean exportDb = options.isExportDb(); if (exportDb) { Path latestCrawlDirPath = CrawlDirUtils.findLatestLoopDir(fs, crawlDirPath); processCrawlDb(conf, latestCrawlDirPath, exportDb); } else { int prevLoop = -1; Path curDirPath = null; while ((curDirPath = CrawlDirUtils.findNextLoopDir(fs, crawlDirPath, prevLoop)) != null) { String curDirName = curDirPath.toUri().toString(); LOGGER.info(""); LOGGER.info("================================================================"); LOGGER.info("Processing " + curDirName); LOGGER.info("================================================================"); int curLoop = CrawlDirUtils.extractLoopNumber(curDirPath); if (curLoop != prevLoop + 1) { LOGGER.warn(String.format("Missing directories between %d and %d", prevLoop, curLoop)); } prevLoop = curLoop; // Process the status and crawldb in curPath processStatus(conf, curDirPath); processCrawlDb(conf, curDirPath, exportDb); } } } catch (Throwable t) { LOGGER.error("Exception running tool", t); System.exit(-1); } }
From source file:bixo.examples.JDBCCrawlTool.java
License:Open Source License
public static void main(String[] args) { JDBCCrawlToolOptions options = new JDBCCrawlToolOptions(); CmdLineParser parser = new CmdLineParser(options); try {/* ww w. j a v a 2s . c o m*/ parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); printUsageAndExit(parser); } // Before we get too far along, see if the domain looks valid. String domain = options.getDomain(); if (domain.startsWith("http")) { System.err.println( "The target domain should be specified as just the host, without the http protocol: " + domain); printUsageAndExit(parser); } if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) { System.err.println( "The target domain should be a valid paid-level domain or subdomain of the same: " + domain); printUsageAndExit(parser); } String outputDirName = options.getOutputDir(); if (options.isDebugLogging()) { System.setProperty("bixo.root.level", "DEBUG"); } else { System.setProperty("bixo.root.level", "INFO"); } if (options.getLoggingAppender() != null) { // Set console vs. DRFA vs. 
something else System.setProperty("bixo.appender", options.getLoggingAppender()); } try { JobConf conf = new JobConf(); Path outputPath = new Path(outputDirName); FileSystem fs = outputPath.getFileSystem(conf); // See if the user is starting from scratch if (options.getDbLocation() == null) { if (fs.exists(outputPath)) { System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName); System.out.println("Warning: Delete the output dir before running"); fs.delete(outputPath, true); } } else { Path dbLocationPath = new Path(options.getDbLocation()); if (!fs.exists(dbLocationPath)) { fs.mkdirs(dbLocationPath); } } if (!fs.exists(outputPath)) { fs.mkdirs(outputPath); Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0); String curLoopDirName = curLoopDir.toUri().toString(); setLoopLoggerFile(curLoopDirName, 0); importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf); } Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath); if (inputPath == null) { System.err.println("No previous cycle output dirs exist in " + outputDirName); printUsageAndExit(parser); } int startLoop = CrawlDirUtils.extractLoopNumber(inputPath); int endLoop = startLoop + options.getNumLoops(); UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS); FetcherPolicy defaultPolicy = new FetcherPolicy(); defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY); defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE); defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT); int crawlDurationInMinutes = options.getCrawlDuration(); boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION; long targetEndTime = hasEndTime ? 
System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) : FetcherPolicy.NO_CRAWL_END_TIME; BaseUrlFilter urlFilter = new DomainUrlFilter(domain); // Now we're ready to start looping, since we've got our current settings for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) { // Adjust target end time, if appropriate. if (hasEndTime) { int remainingLoops = (endLoop - curLoop) + 1; long now = System.currentTimeMillis(); long perLoopTime = (targetEndTime - now) / remainingLoops; defaultPolicy.setCrawlEndTime(now + perLoopTime); } Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop); String curLoopDirName = curLoopDir.toUri().toString(); setLoopLoggerFile(curLoopDirName, curLoop); Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter, options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation()); flow.complete(); // flow.writeDOT("build/valid-flow.dot"); // Input for the next round is our current output inputPath = curLoopDir; } } catch (PlannerException e) { e.writeDOT("build/failed-flow.dot"); System.err.println("PlannerException: " + e.getMessage()); e.printStackTrace(System.err); System.exit(-1); } catch (Throwable t) { System.err.println("Exception running tool: " + t.getMessage()); t.printStackTrace(System.err); System.exit(-1); } JDBCTapFactory.shutdown(); }
From source file:bixo.examples.SimpleCrawlTool.java
License:Open Source License
public static void main(String[] args) { SimpleCrawlToolOptions options = new SimpleCrawlToolOptions(); CmdLineParser parser = new CmdLineParser(options); try {/* w w w .ja v a 2s . c o m*/ parser.parseArgument(args); } catch (CmdLineException e) { System.err.println(e.getMessage()); printUsageAndExit(parser); } // Before we get too far along, see if the domain looks valid. String domain = options.getDomain(); if (domain.startsWith("http")) { System.err.println( "The target domain should be specified as just the host, without the http protocol: " + domain); printUsageAndExit(parser); } if (!domain.equals("localhost") && (domain.split("\\.").length < 2)) { System.err.println( "The target domain should be a valid paid-level domain or subdomain of the same: " + domain); printUsageAndExit(parser); } String outputDirName = options.getOutputDir(); if (options.isDebugLogging()) { System.setProperty("bixo.root.level", "DEBUG"); } else { System.setProperty("bixo.root.level", "INFO"); } if (options.getLoggingAppender() != null) { // Set console vs. DRFA vs. something else System.setProperty("bixo.appender", options.getLoggingAppender()); } try { JobConf conf = new JobConf(); Path outputPath = new Path(outputDirName); FileSystem fs = outputPath.getFileSystem(conf); // See if the user isn't starting from scratch then set up the // output directory and create an initial urls subdir. if (!fs.exists(outputPath)) { fs.mkdirs(outputPath); // Create a "0-<timestamp>" sub-directory with just a /urls subdir // In the /urls dir the input file will have a single URL for the target domain. 
Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0); String curLoopDirName = curLoopDir.toUri().toString(); setLoopLoggerFile(curLoopDirName, 0); Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME); importOneDomain(domain, crawlDbPath, conf); } Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath); if (latestDirPath == null) { System.err.println("No previous cycle output dirs exist in " + outputDirName); printUsageAndExit(parser); } Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME); // Set up the start and end loop counts. int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath); int endLoop = startLoop + options.getNumLoops(); // Set up the UserAgent for the fetcher. UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS); // You also get to customize the FetcherPolicy FetcherPolicy defaultPolicy = new FetcherPolicy(); defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY); defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE); defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT); // It is a good idea to set up a crawl duration when running long crawls as you may // end up in situations where the fetch slows down due to a 'long tail' and by // specifying a crawl duration you know exactly when the crawl will end. int crawlDurationInMinutes = options.getCrawlDuration(); boolean hasEndTime = crawlDurationInMinutes != SimpleCrawlToolOptions.NO_CRAWL_DURATION; long targetEndTime = hasEndTime ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE) : FetcherPolicy.NO_CRAWL_END_TIME; // By setting up a url filter we only deal with urls that we want to // instead of all the urls that we extract. 
BaseUrlFilter urlFilter = new DomainUrlFilter(domain); // OK, now we're ready to start looping, since we've got our current settings for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) { // Adjust target end time, if appropriate. if (hasEndTime) { int remainingLoops = (endLoop - curLoop) + 1; long now = System.currentTimeMillis(); long perLoopTime = (targetEndTime - now) / remainingLoops; defaultPolicy.setCrawlEndTime(now + perLoopTime); } Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop); String curLoopDirName = curLoopDirPath.toUri().toString(); setLoopLoggerFile(curLoopDirName, curLoop); Flow flow = SimpleCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent, urlFilter, options); flow.complete(); // Writing out .dot files is a good way to verify your flows. // flow.writeDOT("build/valid-flow.dot"); // Update crawlDbPath to point to the latest crawl db crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME); } } catch (PlannerException e) { e.writeDOT("build/failed-flow.dot"); System.err.println("PlannerException: " + e.getMessage()); e.printStackTrace(System.err); System.exit(-1); } catch (Throwable t) { System.err.println("Exception running tool: " + t.getMessage()); t.printStackTrace(System.err); System.exit(-1); } }
From source file:bixo.examples.SimpleStatusTool.java
License:Open Source License
private static void processStatus(JobConf conf, Path curDirPath) throws IOException { Path statusPath = new Path(curDirPath, CrawlConfig.STATUS_SUBDIR_NAME); Tap statusTap = new Hfs(new TextLine(), statusPath.toUri().toString()); TupleEntryIterator iter = statusTap.openForRead(conf); UrlStatus[] statusValues = UrlStatus.values(); int[] statusCounts = new int[statusValues.length]; int totalEntries = 0; while (iter.hasNext()) { TupleEntry entry = iter.next();//from w w w .j a v a 2s .c om totalEntries += 1; // STATUS_FN, HEADERS_FN, EXCEPTION_FN, STATUS_TIME_FN, HOST_ADDRESS_FN).append(getSuperFields(StatusDatum.class) String statusLine = entry.getString("line"); String[] pieces = statusLine.split("\t"); UrlStatus status = UrlStatus.valueOf(pieces[0]); statusCounts[status.ordinal()] += 1; } for (int i = 0; i < statusCounts.length; i++) { if (statusCounts[i] != 0) { LOGGER.info(String.format("Status %s: %d", statusValues[i].toString(), statusCounts[i])); } } LOGGER.info("Total status: " + totalEntries); LOGGER.info(""); }
From source file:CalculateSentiment.WordCount.java
License:Apache License
public static void main(String[] args) throws Exception { Path tempDir = new Path("wordcount-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); Configuration conf = new Configuration(); String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs(); if (otherArgs.length != 3) { System.err.println("Usage: wordcount <in> <out> <category>"); System.exit(2);//from w w w .ja va2s . com } conf.set("category", otherArgs[2]); // try { // String filePath = otherArgs[0]; // BufferedReader br = new BufferedReader(new FileReader(filePath)); // String line = br.readLine(); // conf.set("category", line); // } catch (Exception e) { // e.printStackTrace(); // } // conf.set("category", WordCount.read(otherArgs[2])); DistributedCache.createSymlink(conf); String path = "CalculateSentiment.obj"; Path filePath = new Path(path); String uriWithLink = filePath.toUri().toString() + "#" + "object"; DistributedCache.addCacheFile(new URI(uriWithLink), conf); // DistributedCache.addCacheFile(new URI("/CalculateSentiment.obj"), conf); Job job = new Job(conf, "Test"); job.setJarByClass(WordCount.class); job.setMapperClass(TokenizerMapper.class); job.setCombinerClass(DoubleSumReducer.class); job.setReducerClass(DoubleSumReducer.class); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); job.setNumReduceTasks(1); FileInputFormat.addInputPath(job, new Path(otherArgs[0])); FileOutputFormat.setOutputPath(job, new Path(otherArgs[1])); System.exit(job.waitForCompletion(true) ? 0 : 1); }
From source file:cascading.flow.hadoop.util.HadoopUtil.java
License:Open Source License
public static void resolvePaths(Configuration config, Collection<String> classpath, String remoteRoot, String resourceSubPath, Map<String, Path> localPaths, Map<String, Path> remotePaths) { FileSystem defaultFS = getDefaultFS(config); FileSystem localFS = getLocalFS(config); Path remoteRootPath = new Path(remoteRoot == null ? "./.staging" : remoteRoot); if (resourceSubPath != null) remoteRootPath = new Path(remoteRootPath, resourceSubPath); remoteRootPath = defaultFS.makeQualified(remoteRootPath); boolean defaultIsLocal = defaultFS.equals(localFS); for (String stringPath : classpath) { Path path = new Path(stringPath); URI uri = path.toUri(); if (uri.getScheme() == null && !defaultIsLocal) // we want to sync {/*from www.j a va2 s . c om*/ Path localPath = localFS.makeQualified(path); if (!exists(localFS, localPath)) throw new FlowException("path not found: " + localPath); String name = localPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; localPaths.put(name, localPath); remotePaths.put(name, defaultFS.makeQualified(new Path(remoteRootPath, path.getName()))); } else if (localFS.equals(getFileSystem(config, path))) { if (!exists(localFS, path)) throw new FlowException("path not found: " + path); Path localPath = localFS.makeQualified(path); String name = localPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; localPaths.put(name, localPath); } else { if (!exists(defaultFS, path)) throw new FlowException("path not found: " + path); Path defaultPath = defaultFS.makeQualified(path); String name = defaultPath.getName(); if (resourceSubPath != null) name = resourceSubPath + "/" + name; remotePaths.put(name, defaultPath); } } }
From source file:cascading.flow.tez.planner.Hadoop2TezFlowStepJob.java
License:Open Source License
/**
 * Creates the step staging path, registers it in the given configuration as
 * the Tez AM staging directory, and ensures the directory exists.
 *
 * <p>When the qualified staging directory is on the local {@code file}
 * scheme, the directory is additionally created through {@link File#mkdirs()}.
 * The check compares the URI scheme of the qualified path against the bare
 * scheme name {@code "file"}; a scheme never contains {@code ":/"}, so
 * testing for a {@code "file:/"} prefix could never match.
 *
 * @param workingConf configuration to update and to resolve the file system from
 * @return the fully qualified staging directory
 * @throws IOException if the file system cannot be accessed or the staging
 *         directory cannot be ensured
 */
private Path prepareEnsureStagingDir(TezConfiguration workingConf) throws IOException {
    String stepStagingPath = createStepStagingPath();

    workingConf.set(TezConfiguration.TEZ_AM_STAGING_DIR, stepStagingPath);

    Path stagingDir = new Path(stepStagingPath);
    FileSystem fileSystem = FileSystem.get(workingConf);

    stagingDir = fileSystem.makeQualified(stagingDir);

    TokenCache.obtainTokensForNamenodes(new Credentials(), new Path[] { stagingDir }, workingConf);

    TezClientUtils.ensureStagingDirExists(workingConf, stagingDir);

    // new File(URI) only accepts "file" URIs, so gate on the qualified path's own scheme.
    if ("file".equalsIgnoreCase(stagingDir.toUri().getScheme())) {
        new File(stagingDir.toUri()).mkdirs();
    }

    return stagingDir;
}
From source file:cascading.tap.hadoop.BaseDistCacheTap.java
License:Open Source License
private void registerURI(Configuration conf, Path path) { URI uri = path.toUri(); LOG.info("adding {} to local resource configuration ", uri); addLocalCacheFiles(conf, uri);//from w w w . java2 s.c o m }
From source file:cascading.tap.hadoop.Hadoop18TapUtil.java
License:Open Source License
private static Path getFinalPath(Path jobOutputDir, Path taskOutput, Path taskOutputPath) throws IOException { URI taskOutputUri = taskOutput.toUri(); URI relativePath = taskOutputPath.toUri().relativize(taskOutputUri); if (taskOutputUri == relativePath) {//taskOutputPath is not a parent of taskOutput throw new IOException( "Can not get the relative path: base = " + taskOutputPath + " child = " + taskOutput); }//from ww w . j a v a2 s. co m if (relativePath.getPath().length() > 0) { return new Path(jobOutputDir, relativePath.getPath()); } else { return jobOutputDir; } }
From source file:cascading.tap.hadoop.Hfs.java
License:Open Source License
private void makeLocal(Configuration conf, Path qualifiedPath, String infoMessage) { String scheme = getLocalModeScheme(conf, "file"); if (!HadoopUtil.isLocal(conf) && qualifiedPath.toUri().getScheme().equalsIgnoreCase(scheme)) { if (LOG.isInfoEnabled()) LOG.info(infoMessage + toString()); HadoopUtil.setLocal(conf); // force job to run locally }// ww w. j ava 2 s .com }