List of usage examples for org.apache.hadoop.fs FileSystem getContentSummary
public ContentSummary getContentSummary(Path f) throws IOException
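For orientation, here is a minimal sketch of the call itself before the project-specific examples. The path /tmp/data and the class name ContentSummaryExample are placeholders, and the accessors used (getLength, getFileCount, getDirectoryCount, getSpaceConsumed) are the same ContentSummary getters the examples on this page rely on.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ContentSummaryExample {
    public static void main(String[] args) throws Exception {
        // Placeholder path; replace with a file or directory that exists on your cluster.
        Path path = new Path(args.length > 0 ? args[0] : "/tmp/data");

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // getContentSummary() aggregates length, file/directory counts and
        // space consumed (including replication) for the whole subtree under 'path'.
        ContentSummary summary = fs.getContentSummary(path);

        System.out.println("length (bytes): " + summary.getLength());
        System.out.println("files:          " + summary.getFileCount());
        System.out.println("directories:    " + summary.getDirectoryCount());
        System.out.println("space consumed: " + summary.getSpaceConsumed());

        fs.close();
    }
}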
From source file:com.tripadvisor.hadoop.BackupHdfs.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all files
 *
 * fs: FileSystem object from HDFS
 * minDate: Oldest date for files to be backed up
 * maxDate: Newest date for files to be backed up
 * p: Path in HDFS to look for files
 * pathList: Will be filled with all files in p
 * hmTimestamps: hashmap of timestamps for later sorting
 **/
public void checkDir(FileSystem fs, long minDate, long maxDate, Path p, ArrayList<Path> pathList,
        HashMap<Path, Long> hmTimestamps) {
    long tmpDate;
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName()) || "jobtracker".equals(p.getName())
                    || sPath.startsWith("/mapred") || "ops".equals(p.getName())
                    || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            // dump the mkdir and chmod commands for this
            // directory -- skip root directory only
            {
                FileStatus stat = fs.getFileStatus(p);

                if (!sPath.equals("/")) {
                    m_wrMkdirs.println("hadoop fs -mkdir " + sPath);
                }

                m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);

                Short sh = new Short(stat.getPermission().toShort());
                m_wrChmods.println("hadoop fs -chmod " + Long.toOctalString(sh.longValue()) + " " + sPath);
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, minDate, maxDate, fStat[i].getPath(), pathList, hmTimestamps);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // another database to regular hive tables to
            // partitioned hive tables. We use table names to
            // both exclude some from the backup, and for the rest
            // to dump out the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                m_nIgnoredTables++;

                if (m_nIgnoredTables < 5) {
                    System.out.println("Skipping ignore-table file: " + sPath);
                } else if (m_nIgnoredTables == 5) {
                    System.out.println("(...not showing other skipped tables...)");
                }
                return;
            }

            FileStatus stat = fs.getFileStatus(p);

            tmpDate = stat.getModificationTime() / 1000;

            // store the chmods/chowns for all files
            m_wrChmods.println("hadoop fs -chown " + stat.getOwner() + ":" + stat.getGroup() + " " + sPath);
            m_wrChmods.println("hadoop fs -chmod " + stat.getPermission().toShort() + " " + sPath);

            // check dates. is the file too old?
            if (tmpDate < minDate) {
                return;
            }

            // is the file too recent?
            if (tmpDate > maxDate) {
                //System.out.println("file too recent: " + sPath);
                return;
            }

            // file timestamp is ok
            pathList.add(p);
            hmTimestamps.put(p, new Long(tmpDate));

            // store info about total bytes needed to backup
            m_nTotalBytes += fs.getContentSummary(p).getLength();
        }
    } catch (IOException e) {
        System.err.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1) ;
    }
}
From source file:com.twitter.hraven.etl.JobFilePreprocessor.java
License:Apache License
@Override
public int run(String[] args) throws Exception {

    // When we started processing. This is also the upper limit of files we
    // accept, next run will pick up the new incoming files.
    long processingStartMillis = System.currentTimeMillis();

    Configuration hbaseConf = HBaseConfiguration.create(getConf());

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(hbaseConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Output should be an hdfs path.
    FileSystem hdfs = FileSystem.get(hbaseConf);

    // Grab the output path argument
    String output = commandLine.getOptionValue("o");
    LOG.info(" output=" + output);
    Path outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);

    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory: " + outputFileStatus.getPath().getName());
    }

    // Grab the input path argument
    String input;
    if (commandLine.hasOption("i")) {
        input = commandLine.getOptionValue("i");
    } else {
        input = hbaseConf.get("mapred.job.tracker.history.completed.location");
    }
    LOG.info("input=" + input);

    // Grab the batch-size argument
    int batchSize;
    if (commandLine.hasOption("b")) {
        try {
            batchSize = Integer.parseInt(commandLine.getOptionValue("b"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "batch size option -b is not a valid number: " + commandLine.getOptionValue("b"), nfe);
        }
        // Additional check
        if (batchSize < 1) {
            throw new IllegalArgumentException(
                    "Cannot process files in batches smaller than 1. Specified batch size option -b is: "
                            + commandLine.getOptionValue("b"));
        }
    } else {
        batchSize = DEFAULT_BATCH_SIZE;
    }

    boolean forceAllFiles = commandLine.hasOption("f");
    LOG.info("forceAllFiles: " + forceAllFiles);

    Path inputPath = new Path(input);
    FileStatus inputFileStatus = hdfs.getFileStatus(inputPath);

    if (!inputFileStatus.isDir()) {
        throw new IOException("Input is not a directory: " + inputFileStatus.getPath().getName());
    }

    // Grab the cluster argument
    String cluster = commandLine.getOptionValue("c");
    LOG.info("cluster=" + cluster);

    /**
     * Grab the size of huge files to be moved argument
     * hbase cell can't store files bigger than
     * maxFileSize, hence no need to consider them for rawloading
     * Reference:
     * {@link https://github.com/twitter/hraven/issues/59}
     */
    String maxFileSizeStr = commandLine.getOptionValue("s");
    LOG.info("maxFileSize=" + maxFileSizeStr);
    long maxFileSize = DEFAULT_RAW_FILE_SIZE_LIMIT;
    try {
        maxFileSize = Long.parseLong(maxFileSizeStr);
    } catch (NumberFormatException nfe) {
        throw new ProcessingException(
                "Caught NumberFormatException during conversion of maxFileSize to long", nfe);
    }

    ProcessRecordService processRecordService = new ProcessRecordService(hbaseConf);

    boolean success = true;
    try {
        // Figure out where we last left off (if anywhere at all)
        ProcessRecord lastProcessRecord = null;

        if (!forceAllFiles) {
            lastProcessRecord = processRecordService.getLastSuccessfulProcessRecord(cluster);
        }

        long minModificationTimeMillis = 0;
        if (lastProcessRecord != null) {
            // Start of this time period is the end of the last period.
            minModificationTimeMillis = lastProcessRecord.getMaxModificationTimeMillis();
        }

        // Do a sanity check. The end time of the last scan better not be later
        // than when we started processing.
        if (minModificationTimeMillis > processingStartMillis) {
            throw new RuntimeException("The last processing record has maxModificationMillis later than now: "
                    + lastProcessRecord);
        }

        // Accept only jobFiles and only those that fall in the desired range of
        // modification time.
        JobFileModifiedRangePathFilter jobFileModifiedRangePathFilter = new JobFileModifiedRangePathFilter(
                hbaseConf, minModificationTimeMillis);

        String timestamp = Constants.TIMESTAMP_FORMAT.format(new Date(minModificationTimeMillis));

        ContentSummary contentSummary = hdfs.getContentSummary(inputPath);
        LOG.info("Listing / filtering (" + contentSummary.getFileCount() + ") files in: " + inputPath
                + " that are modified since " + timestamp);

        // get the files in the done folder,
        // need to traverse dirs under done recursively for versions
        // that include MAPREDUCE-323: on/after hadoop 0.20.203.0
        // on/after cdh3u5
        FileStatus[] jobFileStatusses = FileLister.getListFilesToProcess(maxFileSize, true, hdfs, inputPath,
                jobFileModifiedRangePathFilter);

        LOG.info("Sorting " + jobFileStatusses.length + " job files.");

        Arrays.sort(jobFileStatusses, new FileStatusModificationComparator());

        // Process these files in batches at a time.
        int batchCount = BatchUtil.getBatchCount(jobFileStatusses.length, batchSize);
        LOG.info("Batch count: " + batchCount);
        for (int b = 0; b < batchCount; b++) {
            processBatch(jobFileStatusses, b, batchSize, processRecordService, cluster, outputPath);
        }

    } finally {
        processRecordService.close();
    }

    Statistics statistics = FileSystem.getStatistics(inputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        LOG.info("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }

    // Return the status
    return success ? 0 : 1;
}
From source file:org.apache.accumulo.monitor.servlets.DefaultServlet.java
License:Apache License
private void doAccumuloTable(StringBuilder sb) throws IOException {
    // Accumulo
    VolumeManager vm = VolumeManagerImpl.get(ServerConfiguration.getSiteConfiguration());
    MasterMonitorInfo info = Monitor.getMmi();
    sb.append("<table>\n");
    sb.append("<tr><th colspan='2'><a href='/master'>Accumulo Master</a></th></tr>\n");
    if (info == null) {
        sb.append("<tr><td colspan='2'><span class='error'>Master is Down</span></td></tr>\n");
    } else {
        long totalAcuBytesUsed = 0l;
        long totalHdfsBytesUsed = 0l;

        try {
            for (String baseDir : VolumeConfiguration
                    .getVolumeUris(ServerConfiguration.getSiteConfiguration())) {
                final Path basePath = new Path(baseDir);
                final FileSystem fs = vm.getVolumeByPath(basePath).getFileSystem();

                try {
                    // Calculate the amount of space used by Accumulo on the FileSystem
                    ContentSummary accumuloSummary = fs.getContentSummary(basePath);
                    long bytesUsedByAcuOnFs = accumuloSummary.getSpaceConsumed();
                    totalAcuBytesUsed += bytesUsedByAcuOnFs;

                    // Catch the overflow -- this is big data
                    if (totalAcuBytesUsed < bytesUsedByAcuOnFs) {
                        log.debug("Overflowed long in bytes used by Accumulo for " + baseDir);
                        totalAcuBytesUsed = 0l;
                        break;
                    }

                    // Calculate the total amount of space used on the FileSystem
                    ContentSummary volumeSummary = fs.getContentSummary(new Path("/"));
                    long bytesUsedOnVolume = volumeSummary.getSpaceConsumed();
                    totalHdfsBytesUsed += bytesUsedOnVolume;

                    // Catch the overflow -- this is big data
                    if (totalHdfsBytesUsed < bytesUsedOnVolume) {
                        log.debug("Overflowed long in bytes used in HDFS for " + baseDir);
                        totalHdfsBytesUsed = 0;
                        break;
                    }
                } catch (Exception ex) {
                    log.trace("Unable to get disk usage information for " + baseDir, ex);
                }
            }

            String diskUsed = "Unknown";
            String consumed = null;
            if (totalAcuBytesUsed > 0) {
                // Convert Accumulo usage to a readable String
                diskUsed = bytes(totalAcuBytesUsed);

                if (totalHdfsBytesUsed > 0) {
                    // Compute amount of space used by Accumulo as a percentage of total space usage.
                    consumed = String.format("%.2f%%", totalAcuBytesUsed * 100. / totalHdfsBytesUsed);
                }
            }

            boolean highlight = false;
            tableRow(sb, (highlight = !highlight), "Disk Used", diskUsed);
            if (null != consumed)
                tableRow(sb, (highlight = !highlight), "% of Used DFS", consumed);
            tableRow(sb, (highlight = !highlight), "<a href='/tables'>Tables</a>",
                    NumberType.commas(Monitor.getTotalTables()));
            tableRow(sb, (highlight = !highlight), "<a href='/tservers'>Tablet Servers</a>",
                    NumberType.commas(info.tServerInfo.size(), 1, Long.MAX_VALUE));
            tableRow(sb, (highlight = !highlight), "<a href='/tservers'>Dead Tablet Servers</a>",
                    NumberType.commas(info.deadTabletServers.size(), 0, 0));
            tableRow(sb, (highlight = !highlight), "Tablets",
                    NumberType.commas(Monitor.getTotalTabletCount(), 1, Long.MAX_VALUE));
            tableRow(sb, (highlight = !highlight), "Entries", NumberType.commas(Monitor.getTotalEntries()));
            tableRow(sb, (highlight = !highlight), "Lookups", NumberType.commas(Monitor.getTotalLookups()));
            tableRow(sb, (highlight = !highlight), "Uptime",
                    Duration.format(System.currentTimeMillis() - Monitor.getStartTime()));
        } catch (Exception e) {
            log.debug(e, e);
        }
    }
    sb.append("</table>\n");
}
From source file:org.apache.accumulo.server.client.BulkImporter.java
License:Apache License
private Map<Path, List<AssignmentInfo>> estimateSizes(final AccumuloConfiguration acuConf,
        final Configuration conf, final VolumeManager vm, Map<Path, List<TabletLocation>> assignments,
        Collection<Path> paths, int numThreads) {

    long t1 = System.currentTimeMillis();

    final Map<Path, Long> mapFileSizes = new TreeMap<>();

    try {
        for (Path path : paths) {
            FileSystem fs = vm.getVolumeByPath(path).getFileSystem();
            mapFileSizes.put(path, fs.getContentSummary(path).getLength());
        }
    } catch (IOException e) {
        log.error("Failed to get map files in for {}: {}", paths, e.getMessage(), e);
        throw new RuntimeException(e);
    }

    final Map<Path, List<AssignmentInfo>> ais = Collections
            .synchronizedMap(new TreeMap<Path, List<AssignmentInfo>>());

    ExecutorService threadPool = Executors.newFixedThreadPool(numThreads,
            new NamingThreadFactory("estimateSizes"));

    for (final Entry<Path, List<TabletLocation>> entry : assignments.entrySet()) {
        if (entry.getValue().size() == 1) {
            TabletLocation tabletLocation = entry.getValue().get(0);

            // if the tablet completely contains the map file, there is no
            // need to estimate its size
            ais.put(entry.getKey(), Collections.singletonList(
                    new AssignmentInfo(tabletLocation.tablet_extent, mapFileSizes.get(entry.getKey()))));
            continue;
        }

        Runnable estimationTask = new Runnable() {
            @Override
            public void run() {
                Map<KeyExtent, Long> estimatedSizes = null;

                try {
                    estimatedSizes = FileUtil.estimateSizes(acuConf, entry.getKey(),
                            mapFileSizes.get(entry.getKey()), extentsOf(entry.getValue()), conf, vm);
                } catch (IOException e) {
                    log.warn("Failed to estimate map file sizes {}", e.getMessage());
                }

                if (estimatedSizes == null) {
                    // estimation failed, do a simple estimation
                    estimatedSizes = new TreeMap<>();
                    long estSize = (long) (mapFileSizes.get(entry.getKey()) / (double) entry.getValue().size());
                    for (TabletLocation tl : entry.getValue())
                        estimatedSizes.put(tl.tablet_extent, estSize);
                }

                List<AssignmentInfo> assignmentInfoList = new ArrayList<>(estimatedSizes.size());

                for (Entry<KeyExtent, Long> entry2 : estimatedSizes.entrySet())
                    assignmentInfoList.add(new AssignmentInfo(entry2.getKey(), entry2.getValue()));

                ais.put(entry.getKey(), assignmentInfoList);
            }
        };

        threadPool.submit(new TraceRunnable(new LoggingRunnable(log, estimationTask)));
    }

    threadPool.shutdown();

    while (!threadPool.isTerminated()) {
        try {
            threadPool.awaitTermination(60, TimeUnit.SECONDS);
        } catch (InterruptedException e) {
            log.error("Encountered InterruptedException while waiting for the threadPool to terminate.", e);
            throw new RuntimeException(e);
        }
    }

    long t2 = System.currentTimeMillis();

    log.debug(String.format("Estimated map files sizes in %6.2f secs", (t2 - t1) / 1000.0));

    return ais;
}
From source file:org.apache.accumulo.server.monitor.servlets.DefaultServlet.java
License:Apache License
private void doAccumuloTable(StringBuilder sb) throws IOException {
    // Accumulo
    Configuration conf = CachedConfiguration.getInstance();
    FileSystem fs = TraceFileSystem
            .wrap(FileUtil.getFileSystem(conf, ServerConfiguration.getSiteConfiguration()));
    MasterMonitorInfo info = Monitor.getMmi();
    sb.append("<table>\n");
    sb.append("<tr><th colspan='2'><a href='/master'>Accumulo Master</a></th></tr>\n");
    if (info == null) {
        sb.append("<tr><td colspan='2'><span class='error'>Master is Down</span></td></tr>\n");
    } else {
        String consumed = "Unknown";
        String diskUsed = "Unknown";
        try {
            Path path = new Path(Monitor.getSystemConfiguration().get(Property.INSTANCE_DFS_DIR));
            log.debug("Reading the content summary for " + path);
            try {
                ContentSummary acu = fs.getContentSummary(path);
                ContentSummary rootSummary = fs.getContentSummary(new Path("/"));
                consumed = String.format("%.2f%%",
                        acu.getSpaceConsumed() * 100. / rootSummary.getSpaceConsumed());
                diskUsed = bytes(acu.getSpaceConsumed());
            } catch (Exception ex) {
                log.trace("Unable to get disk usage information from hdfs", ex);
            }

            boolean highlight = false;
            tableRow(sb, (highlight = !highlight), "Disk Used", diskUsed);
            if (fs.getUsed() != 0)
                tableRow(sb, (highlight = !highlight), "% of Used DFS", consumed);
            tableRow(sb, (highlight = !highlight), "<a href='/tables'>Tables</a>",
                    NumberType.commas(Monitor.getTotalTables()));
            tableRow(sb, (highlight = !highlight), "<a href='/tservers'>Tablet Servers</a>",
                    NumberType.commas(info.tServerInfo.size(), 1, Long.MAX_VALUE));
            tableRow(sb, (highlight = !highlight), "<a href='/tservers'>Dead Tablet Servers</a>",
                    NumberType.commas(info.deadTabletServers.size(), 0, 0));
            tableRow(sb, (highlight = !highlight), "Tablets",
                    NumberType.commas(Monitor.getTotalTabletCount(), 1, Long.MAX_VALUE));
            tableRow(sb, (highlight = !highlight), "Entries", NumberType.commas(Monitor.getTotalEntries()));
            tableRow(sb, (highlight = !highlight), "Lookups", NumberType.commas(Monitor.getTotalLookups()));
            tableRow(sb, (highlight = !highlight), "Uptime",
                    Duration.format(System.currentTimeMillis() - Monitor.getStartTime()));
        } catch (Exception e) {
            log.debug(e, e);
        }
    }
    sb.append("</table>\n");
}
From source file:org.apache.carbondata.core.datastorage.store.impl.FileFactory.java
License:Apache License
/**
 * It computes size of directory
 *
 * @param filePath
 * @return size in bytes
 * @throws IOException
 */
public static long getDirectorySize(String filePath) throws IOException {
    FileType fileType = getFileType(filePath);
    switch (fileType) {
    case HDFS:
    case VIEWFS:
        Path path = new Path(filePath);
        FileSystem fs = path.getFileSystem(configuration);
        return fs.getContentSummary(path).getLength();
    case LOCAL:
    default:
        File file = new File(filePath);
        return FileUtils.sizeOfDirectory(file);
    }
}
From source file:org.apache.carbondata.core.datastore.impl.FileFactory.java
License:Apache License
/**
 * It computes size of directory
 *
 * @param filePath
 * @return size in bytes
 * @throws IOException
 */
public static long getDirectorySize(String filePath) throws IOException {
    FileType fileType = getFileType(filePath);
    switch (fileType) {
    case HDFS:
    case ALLUXIO:
    case VIEWFS:
        Path path = new Path(filePath);
        FileSystem fs = path.getFileSystem(configuration);
        return fs.getContentSummary(path).getLength();
    case LOCAL:
    default:
        filePath = getUpdatedFilePath(filePath, fileType);
        File file = new File(filePath);
        return FileUtils.sizeOfDirectory(file);
    }
}
From source file:org.apache.falcon.entity.FileSystemStorage.java
License:Apache License
@Override
@SuppressWarnings("MagicConstant")
public List<FeedInstanceStatus> getListing(Feed feed, String clusterName, LocationType locationType,
        Date start, Date end) throws FalconException {

    Calendar calendar = Calendar.getInstance();
    List<Location> clusterSpecificLocation = FeedHelper
            .getLocations(FeedHelper.getCluster(feed, clusterName), feed);
    Location location = getLocation(clusterSpecificLocation, locationType);
    try {
        FileSystem fileSystem = HadoopClientFactory.get().createProxiedFileSystem(getConf());
        Cluster cluster = ClusterHelper.getCluster(clusterName);
        Properties baseProperties = FeedHelper.getClusterProperties(cluster);
        baseProperties.putAll(FeedHelper.getFeedProperties(feed));
        List<FeedInstanceStatus> instances = new ArrayList<FeedInstanceStatus>();
        Date feedStart = FeedHelper.getCluster(feed, clusterName).getValidity().getStart();
        TimeZone tz = feed.getTimezone();
        Date alignedStart = EntityUtil.getNextStartTime(feedStart, feed.getFrequency(), tz, start);

        String basePath = location.getPath();
        while (!end.before(alignedStart)) {
            Properties allProperties = ExpressionHelper.getTimeVariables(alignedStart, tz);
            allProperties.putAll(baseProperties);
            String feedInstancePath = ExpressionHelper.substitute(basePath, allProperties);
            FileStatus fileStatus = getFileStatus(fileSystem, new Path(feedInstancePath));
            FeedInstanceStatus instance = new FeedInstanceStatus(feedInstancePath);

            Date date = FeedHelper.getDate(basePath, new Path(feedInstancePath), tz);
            instance.setInstance(SchemaHelper.formatDateUTC(date));
            if (fileStatus != null) {
                instance.setCreationTime(fileStatus.getModificationTime());
                ContentSummary contentSummary = fileSystem.getContentSummary(fileStatus.getPath());
                if (contentSummary != null) {
                    long size = contentSummary.getSpaceConsumed();
                    instance.setSize(size);
                    if (!StringUtils.isEmpty(feed.getAvailabilityFlag())) {
                        FileStatus doneFile = getFileStatus(fileSystem,
                                new Path(fileStatus.getPath(), feed.getAvailabilityFlag()));
                        if (doneFile != null) {
                            instance.setStatus(FeedInstanceStatus.AvailabilityStatus.AVAILABLE);
                        } else {
                            instance.setStatus(FeedInstanceStatus.AvailabilityStatus.PARTIAL);
                        }
                    } else {
                        instance.setStatus(size > 0 ? FeedInstanceStatus.AvailabilityStatus.AVAILABLE
                                : FeedInstanceStatus.AvailabilityStatus.EMPTY);
                    }
                }
            }
            instances.add(instance);
            calendar.setTime(alignedStart);
            calendar.add(feed.getFrequency().getTimeUnit().getCalendarUnit(),
                    feed.getFrequency().getFrequencyAsInt());
            alignedStart = calendar.getTime();
        }
        return instances;
    } catch (IOException e) {
        LOG.error("Unable to retrieve listing for {}:{}", locationType, getStorageUrl(), e);
        throw new FalconException("Unable to retrieve listing for (URI " + getStorageUrl() + ")", e);
    }
}
From source file:org.apache.falcon.latedata.LateDataHandler.java
License:Apache License
private long usage(Path inPath, Configuration conf) throws IOException, FalconException {
    FileSystem fs = HadoopClientFactory.get().createFileSystem(inPath.toUri(), conf);
    FileStatus[] fileStatuses = fs.globStatus(inPath);
    if (fileStatuses == null || fileStatuses.length == 0) {
        return 0;
    }
    long totalSize = 0;
    for (FileStatus fileStatus : fileStatuses) {
        totalSize += fs.getContentSummary(fileStatus.getPath()).getLength();
    }
    return totalSize;
}
From source file:org.apache.falcon.regression.core.util.AssertUtil.java
License:Apache License
/**
 * Checks that the content at two locations has the same size.
 *
 * @param firstPath path to the first location
 * @param secondPath path to the second location
 * @param fs hadoop file system for the locations
 * @throws IOException
 */
public static void checkContentSize(String firstPath, String secondPath, FileSystem fs) throws IOException {
    final ContentSummary firstSummary = fs.getContentSummary(new Path(firstPath));
    final ContentSummary secondSummary = fs.getContentSummary(new Path(secondPath));
    LOGGER.info(firstPath + " : firstSummary = " + firstSummary.toString(false));
    LOGGER.info(secondPath + " : secondSummary = " + secondSummary.toString(false));
    Assert.assertEquals(firstSummary.getLength(), secondSummary.getLength(),
            "Contents at the two locations don't have same size.");
}