List of usage examples for org.apache.hadoop.fs Path getName
public String getName()
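Before the project examples, a minimal sketch of what getName() does: it returns the final component of the path, with the scheme, authority, and parent directories stripped. The paths below are hypothetical, chosen only for illustration.

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() keeps only the last path component.
        System.out.println(new Path("/user/data/file.txt").getName());               // file.txt
        System.out.println(new Path("hdfs://namenode:8020/logs/app.log").getName()); // app.log
        System.out.println(new Path("relative/dir").getName());                      // dir
    }
}

The examples below show the same call in real projects: extracting a leaf name to re-parent a file under a new directory, to build a log prefix, or to filter directory listings by name.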
From source file:com.facebook.presto.hive.s3.PrestoS3FileSystem.java
License:Apache License
@Override
public boolean rename(Path src, Path dst) throws IOException {
    boolean srcDirectory;
    try {
        srcDirectory = directory(src);
    } catch (FileNotFoundException e) {
        return false;
    }

    try {
        if (!directory(dst)) {
            // cannot copy a file to an existing file
            return keysEqual(src, dst);
        }
        // move source under destination directory
        dst = new Path(dst, src.getName());
    } catch (FileNotFoundException e) {
        // destination does not exist
    }

    if (keysEqual(src, dst)) {
        return true;
    }

    if (srcDirectory) {
        for (FileStatus file : listStatus(src)) {
            rename(file.getPath(), new Path(dst, file.getPath().getName()));
        }
        deleteObject(keyFromPath(src) + DIRECTORY_SUFFIX);
    } else {
        s3.copyObject(getBucketName(uri), keyFromPath(src), getBucketName(uri), keyFromPath(dst));
        delete(src, true);
    }
    return true;
}
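The re-parenting idiom in rename() above, new Path(dst, src.getName()), keeps the leaf name while swapping the parent. A minimal sketch with made-up S3 paths:

import org.apache.hadoop.fs.Path;

public class RenameTargetSketch {
    public static void main(String[] args) {
        Path src = new Path("s3://bucket/tmp/report.csv"); // hypothetical source file
        Path dstDir = new Path("s3://bucket/archive");     // hypothetical destination directory
        // Keep the leaf name, change the parent directory.
        Path target = new Path(dstDir, src.getName());
        System.out.println(target);                        // s3://bucket/archive/report.csv
    }
}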
From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatternStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatternStr);
            } else {
                String protocolPatternStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatternStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the output directory
        // and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir.
            // In the /crawldb dir the input file will have a single URL for the target domain.
            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);
        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L); // 10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls, as you may
        // end up in situations where the fetch slows down due to a 'long tail', and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with the urls that we want,
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatternStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatternStr);
            } else {
                String protocolPatternStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatternStr);
                // Log.warn("Defaulting to basic url regex filtering (just suffix and protocol)");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // Get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current settings.
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}
From source file:com.fullcontact.cassandra.io.sstable.Descriptor.java
License:Apache License
/**
 * @param filename The SSTable filename
 * @return Descriptor of the SSTable initialized from filename
 * @see #fromFilename(org.apache.hadoop.fs.Path, String)
 */
public static Descriptor fromFilename(String filename) {
    Path file = new Path(filename);
    return fromFilename(file.getParent(), file.getName()).left;
}
From source file:com.fullcontact.sstable.hadoop.SSTableFunctions.java
License:Apache License
/**
 * Return a function which determines the SSTable index file when supplied with the SSTable data file.
 *
 * @return Function.
 */
public static Function<Path, Path> indexFile() {
    return new Function<Path, Path>() {
        @Nullable
        @Override
        public Path apply(@Nullable Path dataFile) {
            final String dataFileName = dataFile.getName();
            return new Path(dataFile.getParent(), dataFileName.replace("-Data.db", "-Index.db"));
        }
    };
}
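A usage sketch for the function above; the data-file path is hypothetical:

import com.fullcontact.sstable.hadoop.SSTableFunctions;
import org.apache.hadoop.fs.Path;

public class IndexFileDemo {
    public static void main(String[] args) {
        Path dataFile = new Path("/cassandra/data/ks/cf/ks-cf-ic-1-Data.db"); // hypothetical SSTable data file
        // getName() supplies the leaf name and getParent() the directory, so only the suffix changes.
        Path indexFile = SSTableFunctions.indexFile().apply(dataFile);
        System.out.println(indexFile); // /cassandra/data/ks/cf/ks-cf-ic-1-Index.db
    }
}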
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java
License:Apache License
AbstractHoplog(FileSystem inputFS, Path filePath, SortedOplogStatistics stats) throws IOException {
    logPrefix = "<" + filePath.getName() + "> ";
    this.fsProvider = new FSProvider(inputFS);
    initialize(filePath, stats, inputFS);
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.AbstractHoplog.java
License:Apache License
public AbstractHoplog(HDFSStoreImpl store, Path filePath, SortedOplogStatistics stats) throws IOException {
    logPrefix = "<" + filePath.getName() + "> ";
    this.fsProvider = new FSProvider(store);
    initialize(filePath, stats, store.getFileSystem());
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HdfsSortedOplogOrganizer.java
License:Apache License
protected FileStatus[] getExpiryMarkers() throws IOException {
    FileSystem fs = store.getFileSystem();
    if (hoplogReadersController.hoplogs == null || hoplogReadersController.hoplogs.size() == 0) {
        // There are no hoplogs in the system. Maybe the bucket does not exist at all.
        if (!fs.exists(bucketPath)) {
            if (logger.isDebugEnabled())
                logger.debug("{}This bucket is unused, skipping expired hoplog check", logPrefix);
            return null;
        }
    }

    FileStatus[] files = FSUtils.listStatus(fs, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All expired hoplogs end with the expire extension and must match the valid file regex
            String fileName = file.getName();
            if (!fileName.endsWith(EXPIRED_HOPLOG_EXTENSION)) {
                return false;
            }
            fileName = truncateExpiryExtension(fileName);
            Matcher matcher = SORTED_HOPLOG_PATTERN.matcher(fileName);
            return matcher.find();
        }
    });
    return files;
}
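The PathFilter pattern above, matching on file.getName(), is the common way to select files from a listing by their leaf name. A generic sketch, assuming a hypothetical directory and extension:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class ListByExtension {
    public static void main(String[] args) throws IOException {
        Path dir = new Path("/data/hoplogs"); // hypothetical directory
        FileSystem fs = dir.getFileSystem(new Configuration());
        // Keep only entries whose leaf name ends with the wanted extension.
        FileStatus[] matches = fs.listStatus(dir, new PathFilter() {
            @Override
            public boolean accept(Path file) {
                return file.getName().endsWith(".hop"); // hypothetical extension
            }
        });
        for (FileStatus status : matches) {
            System.out.println(status.getPath().getName());
        }
    }
}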
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
public HDFSUnsortedHoplogOrganizer(HdfsRegionManager region, int bucketId) throws IOException {
    super(region, bucketId);
    writer = null;
    sequence = new AtomicInteger(0);

    fileSystem = store.getFileSystem();
    if (!fileSystem.exists(bucketPath)) {
        return;
    }

    FileStatus[] validHoplogs = FSUtils.listStatus(fileSystem, bucketPath, new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = HOPLOG_PATTERN.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (validHoplogs != null && validHoplogs.length > 0) {
        for (FileStatus file : validHoplogs) {
            // account for the disk used by this file
            incrementDiskUsage(file.getLen());
        }
    }
}
From source file:com.gemstone.gemfire.cache.hdfs.internal.hoplog.HDFSUnsortedHoplogOrganizer.java
License:Apache License
/**
 * Fixes the size of hoplogs that were not closed properly last time.
 * Such hoplogs are *.tmphop files. Identify them, open them, and close
 * them; this fixes the size. After doing this, rename them to *.hop.
 *
 * @throws IOException
 * @throws ForceReattemptException
 */
void identifyAndFixTmpHoplogs(FileSystem fs) throws IOException, ForceReattemptException {
    if (logger.isDebugEnabled())
        logger.debug("{}Fixing temporary hoplogs", logPrefix);

    // A different filesystem is passed to this function for the following reason:
    // For HDFS, if a file wasn't closed properly last time, then while calling
    // FileSystem.append for this file, FSNamesystem.startFileInternal ->
    // FSNamesystem.recoverLeaseInternal gets called. This function throws
    // AlreadyBeingCreatedException if there is an open handle to any other file
    // created using the same FileSystem object. This is a bug and is being tracked at:
    // https://issues.apache.org/jira/browse/HDFS-3848?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
    //
    // The fix for this bug is not yet part of Pivotal HD. So to overcome the bug,
    // we create a new file system for the timer task so that it does not encounter the bug.
    FileStatus[] tmpHoplogs = FSUtils.listStatus(fs, fs.makeQualified(bucketPath), new PathFilter() {
        @Override
        public boolean accept(Path file) {
            // All valid hoplog files must match the regex
            Matcher matcher = patternForTmpHoplog.matcher(file.getName());
            return matcher.matches();
        }
    });

    if (tmpHoplogs == null || tmpHoplogs.length == 0) {
        if (logger.isDebugEnabled())
            logger.debug("{}No files to fix", logPrefix);
        return;
    }

    // Ping secondaries so that in case of split brain, no other vm has taken up
    // as primary. #50110.
    pingSecondaries();

    if (logger.isDebugEnabled())
        logger.debug("{}Files to fix " + tmpHoplogs.length, logPrefix);

    String currentHoplogName = null;
    // Get the current hoplog name; we need to ignore the current hoplog while fixing.
    if (currentHoplog != null) {
        currentHoplogName = currentHoplog.getFileName();
    }

    for (int i = 0; i < tmpHoplogs.length; i++) {
        // Skip directories
        if (tmpHoplogs[i].isDirectory()) {
            continue;
        }

        final Path p = tmpHoplogs[i].getPath();
        if (tmpHoplogs[i].getPath().getName().equals(currentHoplogName)) {
            if (logger.isDebugEnabled())
                logger.debug("{}Skipping current file: " + tmpHoplogs[i].getPath().getName(), logPrefix);
            continue;
        }

        SequenceFileHoplog hoplog = new SequenceFileHoplog(fs, p, stats);
        try {
            makeLegitimate(hoplog);
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p + " was a temporary "
                    + "hoplog because the node managing it wasn't shutdown properly last time. Fixed the hoplog name."));
        } catch (IOException e) {
            logger.info(LocalizedMessage.create(LocalizedStrings.DEBUG, "Hoplog " + p + " is still a temporary "
                    + "hoplog because the node managing it wasn't shutdown properly last time. Failed to "
                    + "change the hoplog name because an exception was thrown while fixing it. " + e));
        }
    }
}