Example usage for org.apache.hadoop.fs Path SEPARATOR

List of usage examples for org.apache.hadoop.fs Path SEPARATOR

Introduction

On this page you can find example usage for org.apache.hadoop.fs Path SEPARATOR.

Prototype

String SEPARATOR

Document

The directory separator, a slash.
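
Path.SEPARATOR is a public static final String on org.apache.hadoop.fs.Path holding the separator "/", so it can be concatenated directly into path strings. A minimal sketch (the directory names are illustrative):

import org.apache.hadoop.fs.Path;

public class SeparatorExample {
    public static void main(String[] args) {
        // Join path segments with the separator string
        Path byConcat = new Path("data" + Path.SEPARATOR + "2015" + Path.SEPARATOR + "01");
        System.out.println(byConcat); // data/2015/01

        // Equivalent: the Path(parent, child) constructor inserts the separator itself
        Path byParent = new Path(new Path("data"), "2015" + Path.SEPARATOR + "01");
        System.out.println(byParent); // data/2015/01
    }
}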

Usage

From source file:gobblin.service.modules.core.GobblinServiceManager.java

License:Apache License

private Path getServiceWorkDirPath(FileSystem fs, String serviceName, String serviceId) {
    return new Path(fs.getHomeDirectory(), serviceName + Path.SEPARATOR + serviceId);
}
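
For example, assuming a home directory of /user/gobblin and illustrative service values:

// fs.getHomeDirectory() = /user/gobblin, serviceName = "svc", serviceId = "run-1"
// -> new Path(/user/gobblin, "svc" + Path.SEPARATOR + "run-1") -> /user/gobblin/svc/run-1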

From source file:gobblin.source.DatePartitionedAvroFileSource.java

License:Apache License

private Path constructSourcePath(DateTime date) {
    StringBuilder pathBuilder = new StringBuilder();

    if (!this.sourcePartitionPrefix.isEmpty()) {
        pathBuilder.append(this.sourcePartitionPrefix);
        pathBuilder.append(Path.SEPARATOR);
    }

    pathBuilder.append(this.partitionPatternFormatter.print(date));

    if (!this.sourcePartitionSuffix.isEmpty()) {
        pathBuilder.append(Path.SEPARATOR);
        pathBuilder.append(this.sourcePartitionSuffix);
    }

    return new Path(this.sourceDir, pathBuilder.toString());
}
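
A sketch of the path this builds, assuming an illustrative prefix of "daily", a yyyy/MM/dd partition pattern, a suffix of "late", and a source directory of /my/data:

DateTimeFormatter fmt = DateTimeFormat.forPattern("yyyy/MM/dd");
StringBuilder b = new StringBuilder();
b.append("daily").append(Path.SEPARATOR);             // prefix
b.append(fmt.print(new DateTime(2015, 1, 1, 0, 0)));  // 2015/01/01
b.append(Path.SEPARATOR).append("late");              // suffix
Path source = new Path(new Path("/my/data"), b.toString());
// source -> /my/data/daily/2015/01/01/late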

From source file:gobblin.source.DatePartitionedDailyAvroSource.java

License:Open Source License

/**
 * Helper method to add new {@link WorkUnit}s for this job. It iterates through a date partitioned directory and
 * creates a {@link WorkUnit} for each file that needs to be processed. It then adds that {@link WorkUnit} to a
 * {@link MultiWorkUnitWeightedQueue}.
 */
private void addNewWorkUnits(MultiWorkUnitWeightedQueue multiWorkUnitWeightedQueue) {

    DateTime currentDay = new DateTime();
    DateTime lowWaterMarkDate = new DateTime(this.lowWaterMark);
    String topicName = this.sourceDir.getName();

    // Process all data from the lowWaterMark date until the maxFilesPerJob has been hit
    for (DateTime date = lowWaterMarkDate; !date.isAfter(currentDay)
            && this.fileCount < this.maxFilesPerJob; date = date.plusDays(1)) {

        // Construct a daily path folder - e.g. /my/data/daily/2015/01/01/
        Path dayPath = new Path(this.sourceDir,
                DAILY_FOLDER_NAME + Path.SEPARATOR + DAILY_FOLDER_FORMATTER.print(date));

        try {
            if (this.fs.exists(dayPath)) {

                // Create an extract object for the dayPath
                SourceState partitionState = new SourceState();

                partitionState.addAll(this.sourceState);
                partitionState.setProp(ConfigurationKeys.SOURCE_ENTITY, topicName);

                Extract extract = partitionState.createExtract(this.tableType,
                        partitionState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), topicName);

                LOG.info("Created extract: " + extract.getExtractId() + " for path " + dayPath);

                // Create a WorkUnit for each file in the folder
                for (FileStatus fileStatus : this.fs.listStatus(dayPath, getFileFilter())) {

                    LOG.info("Will process file " + fileStatus.getPath());

                    partitionState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
                            fileStatus.getPath());
                    partitionState.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, date.getMillis());
                    partitionState.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, date.getMillis());

                    WorkUnit singleWorkUnit = partitionState.createWorkUnit(extract);

                    multiWorkUnitWeightedQueue.addWorkUnit(singleWorkUnit, fileStatus.getLen());

                    this.fileCount++;
                }
            } else {
                LOG.info("Path " + dayPath + " does not exist, skipping");
            }
        } catch (IOException e) {
            Throwables.propagate(e);
        }
    }

    LOG.info("Total number of files extracted for the current run: " + this.fileCount);
}

From source file:gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java

License:Open Source License

@Test
public void testReadPartitionsByMinute() throws IOException, DataRecordException {

    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY,
            OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);

    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
            DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Read data partitioned by minute, i.e. each workunit is assigned records under the same YYYY/MM/dd/HH_mm directory
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
}

From source file:gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java

License:Open Source License

@Test
public void testWorksNoPrefix() throws IOException, DataRecordException {
    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY,
            OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY + Path.SEPARATOR + PREFIX);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);

    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
            DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    // Read data partitioned by minute, i.e. each workunit is assigned records under the same YYYY/MM/dd/HH_mm directory
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    verifyWorkUnits(workunits);
}

From source file:gobblin.source.extractor.extract.google.GoogleDriveFileSystem.java

License:Apache License

/**
 * org.apache.hadoop.fs.Path assumes that file system names use a separator and that "/" is the separator.
 * When org.apache.hadoop.fs.Path sees "/" in a path String, it splits the path into parent and name. As a fileID
 * is a random String determined by Google and can itself contain "/", this method checks whether parent and name
 * have been separated and restores the "/" back into the file ID.
 *
 * @param p the Path to convert back into a Google Drive file ID
 * @return the file ID, with any "/" that Path split on restored
 */
public static String toFileId(Path p) {
    if (p.isRoot()) {
        return "";
    }
    final String format = "%s" + Path.SEPARATOR + "%s";
    if (p.getParent() != null && StringUtils.isEmpty(p.getParent().getName())) {
        return p.getName();
    }
    return String.format(format, toFileId(p.getParent()), p.getName());
}
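
A round-trip sketch; the file ID below is made up, but it shows how a slash inside an ID, split apart by Path, is put back by toFileId:

String fileId = "0B1xabc/def123";            // hypothetical ID containing "/"
Path p = new Path(Path.SEPARATOR + fileId);  // Path splits this into parent "0B1xabc" and name "def123"
String restored = GoogleDriveFileSystem.toFileId(p);
// restored.equals(fileId) -> true; the "/" is restored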

From source file:gobblin.util.AvroUtils.java

License:Apache License

/**
 * Serialize a generic record as a relative {@link Path}. Useful for converting {@link GenericRecord} type keys
 * into file system locations. For example {field1=v1, field2=v2} returns field1=v1/field2=v2 if includeFieldNames
 * is true, or v1/v2 if it is false. Illegal HDFS tokens such as ':' and '\\' will be replaced with '_'.
 * Additionally, parameter replacePathSeparators controls whether to replace path separators ('/') with '_'.
 *
 * @param record {@link GenericRecord} to serialize.
 * @param includeFieldNames If true, each token in the path will be of the form key=value, otherwise, only the value
 *                          will be included.
 * @param replacePathSeparators If true, path separators ('/') in each token will be replaced with '_'.
 * @return A relative path where each level is a field in the input record.
 */
public static Path serializeAsPath(GenericRecord record, boolean includeFieldNames,
        boolean replacePathSeparators) {
    if (record == null) {
        return new Path("");
    }
    List<String> tokens = Lists.newArrayList();
    for (Schema.Field field : record.getSchema().getFields()) {
        String sanitizedName = HadoopUtils.sanitizePath(field.name(), "_");
        String sanitizedValue = HadoopUtils.sanitizePath(record.get(field.name()).toString(), "_");
        if (replacePathSeparators) {
            sanitizedName = sanitizedName.replaceAll(Path.SEPARATOR, "_");
            sanitizedValue = sanitizedValue.replaceAll(Path.SEPARATOR, "_");
        }
        if (includeFieldNames) {
            tokens.add(String.format("%s=%s", sanitizedName, sanitizedValue));
        } else if (!Strings.isNullOrEmpty(sanitizedValue)) {
            tokens.add(sanitizedValue);
        }
    }
    return new Path(Joiner.on(Path.SEPARATOR).join(tokens));
}
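
A sketch of the javadoc's {field1=v1, field2=v2} example, assuming an illustrative two-field Avro record:

Schema schema = SchemaBuilder.record("Example").fields()
        .requiredString("field1").requiredString("field2").endRecord();
GenericRecord record = new GenericData.Record(schema);
record.put("field1", "v1");
record.put("field2", "v2");

Path withNames = AvroUtils.serializeAsPath(record, true, true);   // field1=v1/field2=v2
Path valuesOnly = AvroUtils.serializeAsPath(record, false, true); // v1/v2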

From source file:gobblin.util.ForkOperatorUtils.java

License:Apache License

/**
 * Get a new path with the given branch name as a sub directory.
 *
 * @param state       the {@link State} that may carry a configured branch name
 * @param path        the base path
 * @param numBranches number of branches (non-negative)
 * @param branchId    branch id (non-negative)
 * @return a new path
 */
public static String getPathForBranch(State state, String path, int numBranches, int branchId) {
    Preconditions.checkNotNull(state);
    Preconditions.checkNotNull(path);
    Preconditions.checkArgument(numBranches >= 0, "The number of branches is expected to be non-negative");
    Preconditions.checkArgument(branchId >= 0, "The branch id is expected to be non-negative");

    return numBranches > 1
            ? path + Path.SEPARATOR + state.getProp(ConfigurationKeys.FORK_BRANCH_NAME_KEY + "." + branchId,
                    ConfigurationKeys.DEFAULT_FORK_BRANCH_NAME + branchId)
            : path;
}
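
A sketch of the branching behavior, with illustrative paths and no branch name configured in the state:

State state = new State();

String single = ForkOperatorUtils.getPathForBranch(state, "/data/out", 1, 0);
// single -> "/data/out" (a single branch leaves the path unchanged)

String branched = ForkOperatorUtils.getPathForBranch(state, "/data/out", 2, 1);
// branched -> "/data/out" + Path.SEPARATOR + ConfigurationKeys.DEFAULT_FORK_BRANCH_NAME + "1"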

From source file:gobblin.util.HadoopUtils.java

License:Apache License

/**
 * A thread safe variation of {@link #renamePath(FileSystem, Path, Path)} which can be used in
 * multi-threaded/multi-mapper environment. The rename operation always happens at file level hence directories are
 * not overwritten under the 'to' path.
 *
 * <p>
 * If the contents of destination 'to' path is not expected to be modified concurrently, use
 * {@link #renamePath(FileSystem, Path, Path)} which is faster and more optimized
 * </p>
 *
 * <b>NOTE: This does not seem to be working for all {@link FileSystem} implementations. Use
 * {@link #renameRecursively(FileSystem, Path, Path)}</b>
 *
 * @param fileSystem the {@link FileSystem} on which the data is moved
 * @param from       source path of the data to be moved
 * @param to         destination path for the data
 *
 */
public static void safeRenameRecursively(FileSystem fileSystem, Path from, Path to) throws IOException {

    for (FileStatus fromFile : FileListUtils.listFilesRecursively(fileSystem, from)) {

        Path relativeFilePath = new Path(
                StringUtils.substringAfter(fromFile.getPath().toString(), from.toString() + Path.SEPARATOR));

        Path toFilePath = new Path(to, relativeFilePath);

        if (!fileSystem.exists(toFilePath)) {
            boolean renamed = false;

            // underlying file open can fail with file not found error due to some race condition
            // when the parent directory is created in another thread, so retry a few times
            for (int i = 0; !renamed && i < MAX_RENAME_TRIES; i++) {
                try {
                    renamed = fileSystem.rename(fromFile.getPath(), toFilePath);
                    break;
                } catch (FileNotFoundException e) {
                    if (i + 1 >= MAX_RENAME_TRIES) {
                        throw e;
                    }
                }
            }

            if (!renamed) {
                throw new IOException(
                        String.format("Failed to rename %s to %s.", fromFile.getPath(), toFilePath));
            }
            log.info(String.format("Renamed %s to %s", fromFile.getPath(), toFilePath));
        } else {
            log.info(String.format("File already exists %s. Will not rewrite", toFilePath));
        }
    }
}
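
A usage sketch for promoting staged output, with illustrative directories; the call is safe to make concurrently from several threads or mappers:

FileSystem fs = FileSystem.get(new Configuration());
Path staging = new Path("/tmp/job-staging");
Path output = new Path("/data/output");

// Files are renamed one at a time; anything already present under
// /data/output is left in place rather than overwritten.
HadoopUtils.safeRenameRecursively(fs, staging, output);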

From source file:gobblin.util.io.StreamUtils.java

License:Apache License

/**
 * Convert a {@link Path} to a {@link String} and make sure it is properly formatted to be recognized as a directory
 * by {@link TarArchiveEntry}.
 */
private static String formatPathToDir(Path path) {
    return path.toString().endsWith(Path.SEPARATOR) ? path.toString() : path.toString() + Path.SEPARATOR;
}