List of usage examples for org.apache.hadoop.fs.FileSystem.globStatus
public FileStatus[] globStatus(Path pathPattern) throws IOException
Returns all the files that match pathPattern and are not checksum files. Note that globStatus may return null (for example, when a wildcard-free pattern names a path that does not exist), so callers typically check for both null and an empty array, as several of the examples below do.
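Before the project examples below, a minimal, self-contained sketch of the call (the /data/logs directory and the *.log glob are made up for illustration):

    import java.io.IOException;

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileStatus;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class GlobStatusExample {
        public static void main(String[] args) throws IOException {
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(conf);

            // Hypothetical glob: every .log file under /data/logs.
            FileStatus[] statuses = fs.globStatus(new Path("/data/logs/*.log"));

            // globStatus can return null, so guard before iterating.
            if (statuses == null || statuses.length == 0) {
                System.out.println("Nothing matched the pattern");
                return;
            }
            for (FileStatus status : statuses) {
                System.out.println(status.getPath() + "\t" + status.getLen() + " bytes");
            }
        }
    }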
From source file:com.twitter.elephanttwin.retrieval.IndexedPigLoader.java
License:Apache License
/**
 * Report what columns have been indexed before. The current implementation
 * only reports the columns indexed on all input files the PigLoader needs to
 * work on. This is done by inspecting the FileIndexDescriptor of each input
 * file.
 */
private String[] getUnionedPartitionKeys(String location, Job job) throws IOException {
    if (location == null || location.equals("")) {
        return null;
    }
    Configuration conf = job.getConfiguration();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(new Path(indexDir))) {
        LOG.info("index dir:" + indexDir + " does not exist, no indexes will be used");
        return null;
    }
    LOG.info("checking directory:" + new Path(indexDir + new Path(location).toUri().getPath()));
    FileStatus[] fileStatuses = fs.globStatus(new Path(indexDir + new Path(location).toUri().getPath()));
    if (fileStatuses == null || fileStatuses.length == 0) {
        LOG.info("index dir:" + indexDir + location + " does not have indexes, no indexes will be used");
        return null;
    }
    // Return all indexed column names from all base files under location
    // which have been previously indexed.
    HashSet<String> indexedColumns = new HashSet<String>();
    List<FileStatus> indexMetaFiles = new ArrayList<FileStatus>();
    for (FileStatus status : fileStatuses) {
        HdfsUtils.addInputPathRecursively(indexMetaFiles, fs, status.getPath(),
            HdfsUtils.hiddenDirectoryFilter, indexMetaPathFilter);
    }
    LOG.info("found " + indexMetaFiles.size() + " index descriptor files");
    for (FileStatus indexMetaFile : indexMetaFiles) {
        FSDataInputStream in = fs.open(indexMetaFile.getPath());
        ThriftWritable<FileIndexDescriptor> writable =
            ThriftWritable.newInstance(FileIndexDescriptor.class);
        writable.readFields(in);
        FileIndexDescriptor indexDescriptor = writable.get();
        List<IndexedField> indexedFields = indexDescriptor.getIndexedFields();
        in.close();
        for (IndexedField field : indexedFields) {
            indexedColumns.add(field.getFieldName());
        }
    }
    if (indexedColumns.size() == 0) {
        return null;
    }
    return indexedColumns.toArray(new String[indexedColumns.size()]);
}
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
/**
 * Concatenate the content of all HDFS files matching {@code hdfsGlob} into a
 * local file named {@code localFilename}.
 *
 * @param hdfsNameNode the name of the Hadoop name node.
 * @param hdfsGlob files matching this pattern will be fetched.
 * @param localFilename name of the local file to store the concatenated content.
 * @return the newly created file.
 * @throws IOException when the file cannot be created/written.
 */
public static File getHdfsFiles(String hdfsNameNode, String hdfsGlob, String localFilename)
        throws IOException {
    Preconditions.checkNotNull(localFilename);
    Preconditions.checkNotNull(hdfsGlob);
    Preconditions.checkNotNull(hdfsNameNode);

    // Init the FS connection and the local file.
    Configuration config = new Configuration();
    config.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, hdfsNameNode);
    FileSystem dfs = FileSystem.get(config);
    File localFile = new File(localFilename);
    FileOutputStream localStream = new FileOutputStream(localFile);

    // Get the files that need downloading. globStatus may return null when
    // nothing matches, so guard before dereferencing.
    FileStatus[] statuses = dfs.globStatus(new Path(hdfsGlob));
    if (statuses == null) {
        statuses = new FileStatus[0];
    }
    LOG.info("Pattern " + hdfsGlob + " matched " + statuses.length + " HDFS files, "
        + "fetching to " + localFile.getCanonicalPath() + "...");

    // Append each file.
    int copiedChars = 0;
    FSDataInputStream remoteStream = null;
    for (FileStatus status : statuses) {
        Path src = status.getPath();
        try {
            remoteStream = dfs.open(src);
            copiedChars += IOUtils.copy(remoteStream, localStream);
        } catch (IOException e) {
            LOG.severe("Failed to open/copy " + src);
        } finally {
            IOUtils.closeQuietly(remoteStream);
        }
    }
    localStream.close();
    LOG.info("Fetched " + copiedChars + " bytes to the local FS");
    return localFile;
}
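A hedged usage sketch of the helper above (a fragment, assuming the HdfsUtils class is on the classpath; the name-node address, glob, and local path are hypothetical):

    // Hypothetical name node, glob, and destination; substitute real values.
    File merged = HdfsUtils.getHdfsFiles("hdfs://namenode:8020",
        "/logs/2016/08/02/part-*", "/tmp/merged-logs.txt");
    System.out.println("Wrote " + merged.length() + " bytes to " + merged.getPath());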
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
/**
 * Returns {@link FileStatus} instances for all part files beneath the given parent URI.
 *
 * @param fs file system with which to retrieve part file status.
 * @param parent the parent URI within which part files should be globbed.
 * @return status of part files within the parent URI.
 * @throws IOException
 */
public static List<FileStatus> partFileStatus(FileSystem fs, URI parent) throws IOException {
    return Lists.newArrayList(fs.globStatus(new Path(new Path(parent), PART_FILE_GLOB)));
}
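A hedged usage sketch (a fragment; the job-output URI is hypothetical, and PART_FILE_GLOB is a constant of HdfsUtils whose exact pattern this excerpt does not show):

    // Hypothetical parent URI of a job's output directory.
    FileSystem fs = FileSystem.get(new Configuration());
    URI jobOutput = URI.create("hdfs://namenode/jobs/output");
    for (FileStatus part : HdfsUtils.partFileStatus(fs, jobOutput)) {
        System.out.println(part.getPath() + "\t" + part.getLen());
    }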
From source file:com.uber.hoodie.cli.commands.FileSystemViewCommand.java
License:Apache License
/**
 * Build the file system view.
 *
 * @param globRegex path regex
 * @param maxInstant max instant to be used for displaying file-instants
 * @param readOptimizedOnly include only the read-optimized view
 * @param includeMaxInstant include the max instant
 * @param includeInflight include inflight instants
 * @param excludeCompaction exclude compaction instants
 * @return the file system view
 * @throws IOException
 */
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant,
        boolean readOptimizedOnly, boolean includeMaxInstant, boolean includeInflight,
        boolean excludeCompaction) throws IOException {
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(
        HoodieCLI.tableMetadata.getHadoopConf(), HoodieCLI.tableMetadata.getBasePath(), true);
    FileSystem fs = HoodieCLI.fs;
    String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex);
    FileStatus[] statuses = fs.globStatus(new Path(globPath));

    HoodieTimeline timeline;
    if (readOptimizedOnly) {
        timeline = metaClient.getActiveTimeline().getCommitTimeline();
    } else if (excludeCompaction) {
        timeline = metaClient.getActiveTimeline().getCommitsTimeline();
    } else {
        timeline = metaClient.getActiveTimeline().getCommitsAndCompactionTimeline();
    }
    if (!includeInflight) {
        timeline = timeline.filterCompletedInstants();
    }

    Stream<HoodieInstant> instantsStream = timeline.getInstants();
    if (!maxInstant.isEmpty()) {
        final BiPredicate<String, String> predicate;
        if (includeMaxInstant) {
            predicate = HoodieTimeline.GREATER_OR_EQUAL;
        } else {
            predicate = HoodieTimeline.GREATER;
        }
        instantsStream = instantsStream.filter(is -> predicate.test(maxInstant, is.getTimestamp()));
    }

    HoodieTimeline filteredTimeline = new HoodieDefaultTimeline(instantsStream,
        (Function<HoodieInstant, Optional<byte[]>> & Serializable)
            metaClient.getActiveTimeline()::getInstantDetails);
    return new HoodieTableFileSystemView(metaClient, filteredTimeline, statuses);
}
From source file:com.uber.hoodie.cli.commands.HoodieLogFileCommand.java
License:Apache License
@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files") public String showLogFileCommits( @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final String logFilePathPattern, @CliOption(key = {/*from www .j a v a2 s .co m*/ "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { FileSystem fs = HoodieCLI.tableMetadata.getFs(); List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern))) .map(status -> status.getPath().toString()).collect(Collectors.toList()); Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata = Maps .newHashMap(); int totalEntries = 0; int numCorruptBlocks = 0; int dummyInstantTimeCount = 0; for (String logFilePath : logFilePaths) { FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath)); Schema writerSchema = new AvroSchemaConverter().convert( SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFilePath))); HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema); // read the avro blocks while (reader.hasNext()) { HoodieLogBlock n = reader.next(); String instantTime; int recordCount = 0; if (n instanceof HoodieCorruptBlock) { try { instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME); if (instantTime == null) { throw new Exception("Invalid instant time " + instantTime); } } catch (Exception e) { numCorruptBlocks++; instantTime = "corrupt_block_" + numCorruptBlocks; // could not read metadata for corrupt block } } else { instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME); if (instantTime == null) { // This can happen when reading archived commit files since they were written without any instant time dummyInstantTimeCount++; instantTime = "dummy_instant_time_" + dummyInstantTimeCount; } if (n instanceof HoodieAvroDataBlock) { recordCount = ((HoodieAvroDataBlock) n).getRecords().size(); } } if (commitCountAndMetadata.containsKey(instantTime)) { commitCountAndMetadata.get(instantTime).add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); totalEntries++; } else { List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list = new ArrayList<>(); list.add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount)); commitCountAndMetadata.put(instantTime, list); totalEntries++; } } reader.close(); } List<Comparable[]> rows = new ArrayList<>(); int i = 0; ObjectMapper objectMapper = new ObjectMapper(); for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata .entrySet()) { String instantTime = entry.getKey().toString(); for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry .getValue()) { Comparable[] 
output = new Comparable[5]; output[0] = instantTime; output[1] = tuple3._3(); output[2] = tuple3._1().toString(); output[3] = objectMapper.writeValueAsString(tuple3._2()._1()); output[4] = objectMapper.writeValueAsString(tuple3._2()._2()); rows.add(output); i++; } } TableHeader header = new TableHeader().addTableHeaderField("InstantTime").addTableHeaderField("RecordCount") .addTableHeaderField("BlockType").addTableHeaderField("HeaderMetadata") .addTableHeaderField("FooterMetadata"); return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows); }
From source file:com.uber.hoodie.cli.commands.HoodieLogFileCommand.java
License:Apache License
@CliCommand(value = "show logfile records", help = "Read records from log files") public String showLogFileRecords(@CliOption(key = { "limit" }, mandatory = false, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit, @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") final String logFilePathPattern, @CliOption(key = "mergeRecords", mandatory = false, help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException { System.out.println("===============> Showing only " + limit + " records <==============="); FileSystem fs = HoodieCLI.tableMetadata.getFs(); List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(logFilePathPattern))) .map(status -> status.getPath().toString()).collect(Collectors.toList()); // TODO : readerSchema can change across blocks/log files, fix this inside Scanner AvroSchemaConverter converter = new AvroSchemaConverter(); // get schema from last log file Schema readerSchema = converter .convert(SchemaUtil.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))); List<IndexedRecord> allRecords = new ArrayList<>(); if (shouldMerge) { System.out.println("===========================> MERGING RECORDS <==================="); HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, HoodieCLI.tableMetadata.getBasePath(), logFilePaths, readerSchema, HoodieCLI.tableMetadata.getActiveTimeline().getCommitTimeline().lastInstant().get() .getTimestamp(), Long.valueOf(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES), Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_LAZY_BLOCK_READ_ENABLED), Boolean.valueOf(HoodieCompactionConfig.DEFAULT_COMPACTION_REVERSE_LOG_READ_ENABLED), Integer.valueOf(HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE), HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH); for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) { Optional<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema); if (allRecords.size() >= limit) { break; }/*from w ww.j av a2s . c o m*/ allRecords.add(record.get()); } } else { for (String logFile : logFilePaths) { Schema writerSchema = new AvroSchemaConverter().convert( SchemaUtil.readSchemaFromLogFile(HoodieCLI.tableMetadata.getFs(), new Path(logFile))); HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema); // read the avro blocks while (reader.hasNext()) { HoodieLogBlock n = reader.next(); if (n instanceof HoodieAvroDataBlock) { HoodieAvroDataBlock blk = (HoodieAvroDataBlock) n; List<IndexedRecord> records = blk.getRecords(); allRecords.addAll(records); if (allRecords.size() >= limit) { break; } } } reader.close(); if (allRecords.size() >= limit) { break; } } } String[][] rows = new String[allRecords.size() + 1][]; int i = 0; for (IndexedRecord record : allRecords) { String[] data = new String[1]; data[0] = record.toString(); rows[i] = data; i++; } return HoodiePrintHelper.print(new String[] { "Records" }, rows); }
From source file:com.uber.hoodie.cli.commands.StatsCommand.java
License:Apache License
@CliCommand(value = "stats filesizes", help = "File Sizes. Display summary stats on sizes of files") public String fileSizeStats(@CliOption(key = { "partitionPath" }, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") final String globRegex, @CliOption(key = {/* ww w. j a va 2 s . c om*/ "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException { FileSystem fs = HoodieCLI.fs; String globPath = String.format("%s/%s/*", HoodieCLI.tableMetadata.getBasePath(), globRegex); FileStatus[] statuses = fs.globStatus(new Path(globPath)); // max, min, #small files < 10MB, 50th, avg, 95th Histogram globalHistogram = new Histogram(new UniformReservoir(MAX_FILES)); HashMap<String, Histogram> commitHistoMap = new HashMap<String, Histogram>(); for (FileStatus fileStatus : statuses) { String commitTime = FSUtils.getCommitTime(fileStatus.getPath().getName()); long sz = fileStatus.getLen(); if (!commitHistoMap.containsKey(commitTime)) { commitHistoMap.put(commitTime, new Histogram(new UniformReservoir(MAX_FILES))); } commitHistoMap.get(commitTime).update(sz); globalHistogram.update(sz); } List<Comparable[]> rows = new ArrayList<>(); int ind = 0; for (String commitTime : commitHistoMap.keySet()) { Snapshot s = commitHistoMap.get(commitTime).getSnapshot(); rows.add(printFileSizeHistogram(commitTime, s)); } Snapshot s = globalHistogram.getSnapshot(); rows.add(printFileSizeHistogram("ALL", s)); Function<Object, String> converterFunction = entry -> NumericUtils .humanReadableByteCount((Double.valueOf(entry.toString()))); Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>(); fieldNameToConverterMap.put("Min", converterFunction); fieldNameToConverterMap.put("10th", converterFunction); fieldNameToConverterMap.put("50th", converterFunction); fieldNameToConverterMap.put("avg", converterFunction); fieldNameToConverterMap.put("95th", converterFunction); fieldNameToConverterMap.put("Max", converterFunction); fieldNameToConverterMap.put("StdDev", converterFunction); TableHeader header = new TableHeader().addTableHeaderField("CommitTime").addTableHeaderField("Min") .addTableHeaderField("10th").addTableHeaderField("50th").addTableHeaderField("avg") .addTableHeaderField("95th").addTableHeaderField("Max").addTableHeaderField("NumFiles") .addTableHeaderField("StdDev"); return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows); }
From source file:com.uber.hoodie.common.HoodieClientTestUtils.java
License:Apache License
/**
 * Reads the paths under a hoodie dataset out as a DataFrame.
 */
public static Dataset<Row> read(JavaSparkContext jsc, String basePath, SQLContext sqlContext,
        FileSystem fs, String... paths) {
    List<String> filteredPaths = new ArrayList<>();
    try {
        HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath, true);
        for (String path : paths) {
            TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
                metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(),
                fs.globStatus(new Path(path)));
            List<HoodieDataFile> latestFiles =
                fileSystemView.getLatestDataFiles().collect(Collectors.toList());
            for (HoodieDataFile file : latestFiles) {
                filteredPaths.add(file.getPath());
            }
        }
        return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
    } catch (Exception e) {
        throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
    }
}
From source file:com.yahoo.glimmer.util.MapReducePartInputStreamEnumeration.java
License:Open Source License
public MapReducePartInputStreamEnumeration(FileSystem fileSystem, Path srcPath) throws IOException {
    this.fileSystem = fileSystem;
    CompressionCodecFactory factory = new CompressionCodecFactory(fileSystem.getConf());
    codecIfAny = factory.getCodec(srcPath);
    FileStatus srcFileStatus = fileSystem.getFileStatus(srcPath);
    if (srcFileStatus.isDirectory()) {
        String partFilenamePattern = "part-?-?????";
        if (codecIfAny != null) {
            partFilenamePattern += codecIfAny.getDefaultExtension();
        }
        Path partPathGlob = new Path(srcPath, partFilenamePattern);
        // globStatus returns FileStatus objects sorted by filename.
        partFileStatuses = fileSystem.globStatus(partPathGlob);
    } else {
        partFileStatuses = new FileStatus[] { srcFileStatus };
    }
}
From source file:com.yahoo.glimmer.util.MergeSortTool.java
License:Open Source License
@Override
public int run(String[] args) throws Exception {
    SimpleJSAP jsap = new SimpleJSAP(MergeSortTool.class.getName(),
        "Merges alpha numerically sorted text files on HDFS", new Parameter[] {
            new FlaggedOption(INPUT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'i',
                INPUT_ARG, "input filenames glob eg. .../part-r-?????/sortedlines.text"),
            new FlaggedOption(OUTPUT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, 'o',
                OUTPUT_ARG, "output filename"),
            new FlaggedOption(COUNT_ARG, JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c',
                COUNT_ARG,
                "optionally create a file containing a count of the number of lines merged in text"), });

    JSAPResult jsapResult = jsap.parse(args);
    if (!jsapResult.success()) {
        System.err.print(jsap.getUsage());
        System.exit(1);
    }

    // Maybe quicker to use an MR job with one reducer. Currently
    // decompression, merge and compression are all done in this thread.
    Path inputGlobPath = new Path(jsapResult.getString(INPUT_ARG));

    Configuration config = getConf();
    FileSystem fs = FileSystem.get(config);
    FileStatus[] sources = fs.globStatus(inputGlobPath);
    if (sources == null || sources.length == 0) {
        System.err.println("No files matching input glob:" + inputGlobPath.toString());
        return 1;
    }

    List<Path> sourcePaths = new ArrayList<Path>(sources.length);
    for (FileStatus source : sources) {
        if (source.isDirectory()) {
            System.err.println(source.getPath().toString() + " is a directory.");
            return 1;
        }
        sourcePaths.add(source.getPath());
    }

    Path outputPath = new Path(jsapResult.getString(OUTPUT_ARG));
    CompressionCodecFactory factory = new CompressionCodecFactory(config);

    FSDataOutputStream countsOutputStream = null;
    if (jsapResult.contains(COUNT_ARG)) {
        Path countsPath = new Path(jsapResult.getString(COUNT_ARG));
        countsOutputStream = fs.create(countsPath);
    }

    int lineCount = MergeSortTool.mergeSort(fs, sourcePaths, outputPath, factory);
    System.out.println("Merged " + lineCount + " lines into " + outputPath.toString());

    // Write, flush and close the counts stream only when it was created;
    // otherwise flush()/close() would throw a NullPointerException.
    if (countsOutputStream != null) {
        countsOutputStream.writeBytes("" + lineCount + '\n');
        countsOutputStream.flush();
        countsOutputStream.close();
    }
    return 0;
}