List of usage examples for org.apache.hadoop.fs Path SEPARATOR
String SEPARATOR
To view the source code for org.apache.hadoop.fs Path SEPARATOR, click Source Link.
From source file:gobblin.service.modules.core.GobblinServiceManager.java
License:Apache License
/**
 * Resolves the working directory of a service instance, located under the file system's home
 * directory at {@code <serviceName>/<serviceId>}.
 *
 * @param fs file system whose home directory is used as the parent
 * @param serviceName name of the service; first component of the relative path
 * @param serviceId id of the service instance; second component of the relative path
 * @return the path {@code <home directory>/<serviceName>/<serviceId>}
 */
private Path getServiceWorkDirPath(FileSystem fs, String serviceName, String serviceId) {
  Path homeDir = fs.getHomeDirectory();
  String relativeWorkDir = serviceName + Path.SEPARATOR + serviceId;
  return new Path(homeDir, relativeWorkDir);
}
From source file:gobblin.source.DatePartitionedAvroFileSource.java
License:Apache License
private Path constructSourcePath(DateTime date) { StringBuilder pathBuilder = new StringBuilder(); if (!this.sourcePartitionPrefix.isEmpty()) { pathBuilder.append(this.sourcePartitionPrefix); pathBuilder.append(Path.SEPARATOR); }//from w w w . j av a 2 s .c o m pathBuilder.append(this.partitionPatternFormatter.print(date)); if (!this.sourcePartitionSuffix.isEmpty()) { pathBuilder.append(Path.SEPARATOR); pathBuilder.append(this.sourcePartitionSuffix); } return new Path(this.sourceDir, pathBuilder.toString()); }
From source file:gobblin.source.DatePartitionedDailyAvroSource.java
License:Open Source License
/** * Helper method to add new {@link WorkUnit}s for this job. It iterates through a date partitioned directory and * creates a {@link WorkUnit} for each file that needs to be processed. It then adds that {@link WorkUnit} to a * {@link MultiWorkUnitWeightedQueue}/* w ww. jav a 2s . com*/ */ private void addNewWorkUnits(MultiWorkUnitWeightedQueue multiWorkUnitWeightedQueue) { DateTime currentDay = new DateTime(); DateTime lowWaterMarkDate = new DateTime(this.lowWaterMark); String topicName = this.sourceDir.getName(); // Process all data from the lowWaterMark date until the maxFilesPerJob has been hit for (DateTime date = lowWaterMarkDate; !date.isAfter(currentDay) && this.fileCount < this.maxFilesPerJob; date = date.plusDays(1)) { // Construct a daily path folder - e.g. /my/data/daily/2015/01/01/ Path dayPath = new Path(this.sourceDir, DAILY_FOLDER_NAME + Path.SEPARATOR + DAILY_FOLDER_FORMATTER.print(date)); try { if (this.fs.exists(dayPath)) { // Create an extract object for the dayPath SourceState partitionState = new SourceState(); partitionState.addAll(this.sourceState); partitionState.setProp(ConfigurationKeys.SOURCE_ENTITY, topicName); Extract extract = partitionState.createExtract(this.tableType, partitionState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), topicName); LOG.info("Created extract: " + extract.getExtractId() + " for path " + dayPath); // Create a WorkUnit for each file in the folder for (FileStatus fileStatus : this.fs.listStatus(dayPath, getFileFilter())) { LOG.info("Will process file " + fileStatus.getPath()); partitionState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL, fileStatus.getPath()); partitionState.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, date.getMillis()); partitionState.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, date.getMillis()); WorkUnit singleWorkUnit = partitionState.createWorkUnit(extract); multiWorkUnitWeightedQueue.addWorkUnit(singleWorkUnit, fileStatus.getLen()); 
this.fileCount++; } } else { LOG.info("Path " + dayPath + " does not exist, skipping"); } } catch (IOException e) { Throwables.propagate(e); } } LOG.info("Total number of files extracted for the current run: " + this.fileCount); }
From source file:gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java
License:Open Source License
@Test public void testReadPartitionsByMinute() throws IOException, DataRecordException { DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource(); SourceState state = new SourceState(); state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI); state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY); state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY); state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY); state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2); state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN); state.setProp("date.partitioned.source.min.watermark.value", DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1))); state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY); state.setProp("date.partitioned.source.partition.prefix", PREFIX); state.setProp("date.partitioned.source.partition.suffix", SUFFIX); //Read data partitioned by minutes, i.e each workunit is assigned records under the same YYYY/MM/dd/HH_mm directory List<WorkUnit> workunits = source.getWorkunits(state); Assert.assertEquals(workunits.size(), 4); verifyWorkUnits(workunits);/*w w w .j av a 2s . c o m*/ }
From source file:gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java
License:Open Source License
@Test public void testWorksNoPrefix() throws IOException, DataRecordException { DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource(); SourceState state = new SourceState(); state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI); state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY); state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX); state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY); state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2); state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN); state.setProp("date.partitioned.source.min.watermark.value", DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1))); state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY); state.setProp("date.partitioned.source.partition.suffix", SUFFIX); //Read data partitioned by minutes, i.e each workunit is assigned records under the same YYYY/MM/dd/HH_mm directory List<WorkUnit> workunits = source.getWorkunits(state); Assert.assertEquals(workunits.size(), 4); verifyWorkUnits(workunits);//from w w w . j av a 2s . co m }
From source file:gobblin.source.extractor.extract.google.GoogleDriveFileSystem.java
License:Apache License
/**
 * Rebuilds a Google Drive file ID from a {@link Path}.
 *
 * <p>{@code org.apache.hadoop.fs.Path} treats "/" as the separator and splits a path string into
 * parent and name wherever it occurs. Because a file ID is an opaque string chosen by Google that
 * may itself contain "/", this method walks up the parent chain and joins the names back together
 * with "/" to restore the original ID.
 *
 * @param p path whose components encode the file ID
 * @return an empty string for the root path; the path's name when its parent has an empty name;
 *         otherwise the parent's file ID, the separator, and the path's name
 */
public static String toFileId(Path p) {
  if (p.isRoot()) {
    return "";
  }

  Path parent = p.getParent();
  String name = p.getName();

  // A parent with an empty name ends the recursion: only this path's own name is part of the ID.
  boolean parentHasNoName = parent != null && StringUtils.isEmpty(parent.getName());
  if (parentHasNoName) {
    return name;
  }

  return String.format("%s" + Path.SEPARATOR + "%s", toFileId(parent), name);
}
From source file:gobblin.util.AvroUtils.java
License:Apache License
/**
 * Turns a {@link GenericRecord} into a relative {@link Path}, one path level per schema field.
 * Handy for mapping {@link GenericRecord} keys to file system locations. For example,
 * {field1=v1, field2=v2} becomes field1=v1/field2=v2 when includeFieldNames is true, or v1/v2
 * when it is false. Illegal HDFS tokens such as ':' and '\\' are replaced with '_'. The
 * replacePathSeparators flag additionally controls whether '/' inside a token becomes '_'.
 *
 * @param record {@link GenericRecord} to serialize.
 * @param includeFieldNames If true, every token has the form key=value; otherwise only non-empty
 *                          values are emitted.
 * @param replacePathSeparators If true, path separators ('/') inside each token are replaced
 *                              with '_'.
 * @return A relative path where each level corresponds to a field of the input record.
 */
public static Path serializeAsPath(GenericRecord record, boolean includeFieldNames,
    boolean replacePathSeparators) {
  if (record == null) {
    // NOTE(review): Hadoop's Path constructor is believed to reject empty strings - confirm
    // that this branch behaves as intended.
    return new Path("");
  }

  List<String> tokens = Lists.newArrayList();
  for (Schema.Field field : record.getSchema().getFields()) {
    String fieldName = field.name();
    String namePart = HadoopUtils.sanitizePath(fieldName, "_");
    String valuePart = HadoopUtils.sanitizePath(record.get(fieldName).toString(), "_");

    if (replacePathSeparators) {
      namePart = namePart.replaceAll(Path.SEPARATOR, "_");
      valuePart = valuePart.replaceAll(Path.SEPARATOR, "_");
    }

    if (includeFieldNames) {
      tokens.add(String.format("%s=%s", namePart, valuePart));
      continue;
    }

    // Without field names, empty values would produce empty path levels, so they are skipped.
    if (!Strings.isNullOrEmpty(valuePart)) {
      tokens.add(valuePart);
    }
  }

  String joined = Joiner.on(Path.SEPARATOR).join(tokens);
  return new Path(joined);
}
From source file:gobblin.util.ForkOperatorUtils.java
License:Apache License
/** * Get a new path with the given branch name as a sub directory. * * @param numBranches number of branches (non-negative) * @param branchId branch id (non-negative) * @return a new path/* w w w .j a va 2s . c om*/ */ public static String getPathForBranch(State state, String path, int numBranches, int branchId) { Preconditions.checkNotNull(state); Preconditions.checkNotNull(path); Preconditions.checkArgument(numBranches >= 0, "The number of branches is expected to be non-negative"); Preconditions.checkArgument(branchId >= 0, "The branch id is expected to be non-negative"); return numBranches > 1 ? path + Path.SEPARATOR + state.getProp(ConfigurationKeys.FORK_BRANCH_NAME_KEY + "." + branchId, ConfigurationKeys.DEFAULT_FORK_BRANCH_NAME + branchId) : path; }
From source file:gobblin.util.HadoopUtils.java
License:Apache License
/** * A thread safe variation of {@link #renamePath(FileSystem, Path, Path)} which can be used in * multi-threaded/multi-mapper environment. The rename operation always happens at file level hence directories are * not overwritten under the 'to' path./*from w ww. j ava2s.c o m*/ * * <p> * If the contents of destination 'to' path is not expected to be modified concurrently, use * {@link #renamePath(FileSystem, Path, Path)} which is faster and more optimized * </p> * * <b>NOTE: This does not seem to be working for all {@link FileSystem} implementations. Use * {@link #renameRecursively(FileSystem, Path, Path)}</b> * * @param fileSystem on which the data needs to be moved * @param from path of the data to be moved * @param to path of the data to be moved * */ public static void safeRenameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException { for (FileStatus fromFile : FileListUtils.listFilesRecursively(fileSystem, from)) { Path relativeFilePath = new Path( StringUtils.substringAfter(fromFile.getPath().toString(), from.toString() + Path.SEPARATOR)); Path toFilePath = new Path(to, relativeFilePath); if (!fileSystem.exists(toFilePath)) { boolean renamed = false; // underlying file open can fail with file not found error due to some race condition // when the parent directory is created in another thread, so retry a few times for (int i = 0; !renamed && i < MAX_RENAME_TRIES; i++) { try { renamed = fileSystem.rename(fromFile.getPath(), toFilePath); break; } catch (FileNotFoundException e) { if (i + 1 >= MAX_RENAME_TRIES) { throw e; } } } if (!renamed) { throw new IOException( String.format("Failed to rename %s to %s.", fromFile.getPath(), toFilePath)); } log.info(String.format("Renamed %s to %s", fromFile.getPath(), toFilePath)); } else { log.info(String.format("File already exists %s. Will not rewrite", toFilePath)); } } }
From source file:gobblin.util.io.StreamUtils.java
License:Apache License
/**
 * Renders a {@link Path} as a {@link String} that ends with the path separator, so that
 * {@link TarArchiveEntry} recognizes it as a directory.
 *
 * @param path path to render
 * @return the path's string form, with a trailing separator appended if it was missing
 */
private static String formatPathToDir(Path path) {
  String pathString = path.toString();
  if (pathString.endsWith(Path.SEPARATOR)) {
    return pathString;
  }
  return pathString + Path.SEPARATOR;
}