Example usage for org.apache.hadoop.fs Path getParent

List of usage examples for org.apache.hadoop.fs Path getParent

Introduction

On this page you can find example usages of the org.apache.hadoop.fs.Path.getParent() method.

Prototype

public Path getParent() 

Source Link

Document

Returns the parent of a path or null if at root.

Usage

From source file:org.apache.gobblin.data.management.copy.CopyableFile.java

License:Apache License

/**
 * Derives the source and destination dataset descriptors of this {@link CopyableFile} from the
 * file systems it is copied between.
 *
 * <p>When the origin is a directory the dataset name is the path itself; otherwise it is the path's
 * parent folder. Paths are stripped of scheme and authority before being used as names.</p>
 *
 * @param originFs {@link FileSystem} this {@link CopyableFile} originates from
 * @param targetFs {@link FileSystem} this {@link CopyableFile} is copied to
 */
public void setFsDatasets(FileSystem originFs, FileSystem targetFs) {
    // The same directory-or-file decision drives both the source and the destination name.
    boolean originIsDirectory = origin.isDirectory();

    Path sourcePath = Path.getPathWithoutSchemeAndAuthority(origin.getPath());
    String sourceName;
    if (originIsDirectory) {
        sourceName = sourcePath.toString();
    } else {
        sourceName = sourcePath.getParent().toString();
    }
    DatasetDescriptor sourceDescriptor = new DatasetDescriptor(originFs.getScheme(), sourceName);
    sourceDescriptor.addMetadata(DatasetConstants.FS_URI, originFs.getUri().toString());
    sourceData = sourceDescriptor;

    Path destinationPath = Path.getPathWithoutSchemeAndAuthority(destination);
    String destinationName;
    if (originIsDirectory) {
        destinationName = destinationPath.toString();
    } else {
        destinationName = destinationPath.getParent().toString();
    }
    DatasetDescriptor destinationDescriptor = new DatasetDescriptor(targetFs.getScheme(), destinationName);
    destinationDescriptor.addMetadata(DatasetConstants.FS_URI, targetFs.getUri().toString());
    destinationData = destinationDescriptor;
}

From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java

License:Apache License

/**
 * Writes the single file carried by the given stream to its staging location.
 *
 * <p>If an encryption config is set, the destination of the file first gets an extension named after the
 * encryption type. The writer accepts only one file; a second call fails.</p>
 *
 * @param fileAwareInputStream stream together with the {@link CopyableFile} it belongs to
 * @throws IOException if a file has already been processed by this writer, or on file system failure
 */
@Override
public final void writeImpl(FileAwareInputStream fileAwareInputStream) throws IOException {
    final CopyableFile file = fileAwareInputStream.getFile();

    if (encryptionConfig != null) {
        Path plainDestination = file.getDestination();
        String encryptionExtension = "." + EncryptionConfigParser.getEncryptionType(encryptionConfig);
        file.setDestination(PathUtils.addExtension(plainDestination, encryptionExtension));
    }

    // The staging path is resolved after the destination may have been renamed above.
    final Path stagingPath = getStagingFilePath(file);

    if (this.actualProcessedCopyableFile.isPresent()) {
        throw new IOException(this.getClass().getCanonicalName() + " can only process one file.");
    }
    this.actualProcessedCopyableFile = Optional.of(file);

    this.fs.mkdirs(stagingPath.getParent());
    writeImpl(fileAwareInputStream.getInputStream(), stagingPath, file, fileAwareInputStream);
    this.filesWritten.incrementAndGet();
}

From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java

License:Apache License

/**
 * Moves the file from task staging to task output. Each task has its own staging directory but all the tasks share
 * the same task output directory.
 *
 * <p>Returns immediately when this writer has not processed a file. If the commit fails, the staged file is
 * passed to the recovery helper before the exception is rethrown. The staging directory is deleted in every
 * case; a failure of that deletion is only logged.</p>
 *
 * {@inheritDoc}
 *
 * @throws IOException if setting permissions, preparing the output directory or renaming the file fails
 * @see DataWriter#commit()
 */
@Override
public void commit() throws IOException {

    if (!this.actualProcessedCopyableFile.isPresent()) {
        return;
    }

    CopyableFile copyableFile = this.actualProcessedCopyableFile.get();
    Path stagingFilePath = getStagingFilePath(copyableFile);
    Path outputFilePath = getSplitOutputFilePath(copyableFile, this.outputDir,
            copyableFile.getDatasetAndPartition(this.copyableDatasetMetadata), this.state);

    log.info(String.format("Committing data from %s to %s", stagingFilePath, outputFilePath));
    try {
        setFilePermissions(copyableFile);

        Iterator<OwnerAndPermission> ancestorOwnerAndPermissionIt = copyableFile
                .getAncestorsOwnerAndPermission() == null ? Iterators.<OwnerAndPermission>emptyIterator()
                        : copyableFile.getAncestorsOwnerAndPermission().iterator();

        // The parent of the output file must exist before the rename below.
        ensureDirectoryExists(this.fs, outputFilePath.getParent(), ancestorOwnerAndPermissionIt);

        this.fileContext.rename(stagingFilePath, outputFilePath, renameOptions);
    } catch (IOException ioe) {
        // The logger uses {} placeholders; a printf-style %s would be printed literally.
        log.error("Could not commit file {}.", outputFilePath);
        // persist file
        this.recoveryHelper.persistFile(this.state, copyableFile, stagingFilePath);
        throw ioe;
    } finally {
        try {
            this.fs.delete(this.stagingDir, true);
        } catch (IOException deleteFailure) {
            // Best effort: a leftover staging directory must not fail the commit, but keep the cause visible.
            log.warn("Failed to delete staging path at " + this.stagingDir, deleteFailure);
        }
    }
}

From source file:org.apache.gobblin.data.management.copy.writer.TarArchiveInputStreamDataWriter.java

License:Apache License

/**
 * Untars the passed in {@link FileAwareInputStream} to the task's staging directory. Uses the name of the root
 * {@link TarArchiveEntry} in the stream as the directory name for the untarred file. The method also commits the data
 * by moving the file from staging to output directory.
 *
 * <p>Every entry is checked to resolve below the parent of {@code writeAt}; an entry that would land outside of
 * it aborts the extraction. The tar stream, its channel and the input stream are closed before returning.</p>
 *
 * @param inputStream stream containing the tar archive
 * @param writeAt staging path whose name replaces the archive's root directory name
 * @param copyableFile file being copied; only used for logging here
 * @param record the record the stream came from (not used by this implementation)
 * @throws IOException if the archive contains no entries, an entry escapes the output directory, or I/O fails
 * @see org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter#write(org.apache.gobblin.data.management.copy.FileAwareInputStream)
 */
@Override
public void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile,
        FileAwareInputStream record) throws IOException {
    this.closer.register(inputStream);

    TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream);
    final ReadableByteChannel inputChannel = Channels.newChannel(tarIn);
    TarArchiveEntry tarEntry;

    try {
        // flush the first entry in the tar, which is just the root directory
        tarEntry = tarIn.getNextTarEntry();
        if (tarEntry == null) {
            // An empty archive has no root entry; fail with a clear message instead of a NullPointerException.
            throw new IOException(
                    String.format("Cannot unarchive at %s: the tar stream contains no entries", writeAt));
        }
        String tarEntryRootName = StringUtils.remove(tarEntry.getName(), Path.SEPARATOR);

        log.info("Unarchiving at " + writeAt);

        while ((tarEntry = tarIn.getNextTarEntry()) != null) {

            // the API tarEntry.getName() is misleading, it is actually the path of the tarEntry in the tar file
            // NOTE(review): String.replace substitutes every occurrence of the root name, not only the leading
            // one - confirm that entry paths cannot contain the root name a second time.
            String newTarEntryPath = tarEntry.getName().replace(tarEntryRootName, writeAt.getName());
            Path tarEntryStagingPath = new Path(writeAt.getParent(), newTarEntryPath);
            // Reject entries whose path would resolve outside of the staging parent directory.
            if (!FileUtils.isSubPath(writeAt.getParent(), tarEntryStagingPath)) {
                throw new IOException(
                        String.format("Extracted file: %s is trying to write outside of output directory: %s",
                                tarEntryStagingPath, writeAt.getParent()));
            }

            if (tarEntry.isDirectory() && !this.fs.exists(tarEntryStagingPath)) {
                this.fs.mkdirs(tarEntryStagingPath);
            } else if (!tarEntry.isDirectory()) {
                FSDataOutputStream out = this.fs.create(tarEntryStagingPath, true);
                final WritableByteChannel outputChannel = Channels.newChannel(out);
                try {
                    StreamCopier copier = new StreamCopier(inputChannel, outputChannel);
                    if (isInstrumentationEnabled()) {
                        copier.withCopySpeedMeter(this.copySpeedMeter);
                    }
                    this.bytesWritten.addAndGet(copier.copy());
                    if (isInstrumentationEnabled()) {
                        log.info("File {}: copied {} bytes, average rate: {} B/s",
                                copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(),
                                this.copySpeedMeter.getMeanRate());
                    } else {
                        log.info("File {} copied.", copyableFile.getOrigin().getPath());
                    }
                } finally {
                    out.close();
                    outputChannel.close();
                }
            }
        }
    } finally {
        tarIn.close();
        inputChannel.close();
        inputStream.close();
    }
}

From source file:org.apache.gobblin.data.management.policy.HiddenFilterSelectionPolicy.java

License:Apache License

/**
 * Tells whether the given path or any of its ancestors has a name starting with one of the
 * configured hidden-file prefixes.
 *
 * @param path path to inspect; {@code null} yields {@code false}
 * @return {@code true} if some component of the path starts with a hidden prefix
 */
private boolean isPathHidden(Path path) {
    // Walk from the path itself up through its parents until getParent() returns null.
    for (Path current = path; current != null; current = current.getParent()) {
        String component = current.getName();
        for (String hiddenPrefix : this.hiddenFilePrefixes) {
            if (component.startsWith(hiddenPrefix)) {
                return true;
            }
        }
    }
    return false;
}

From source file:org.apache.gobblin.data.management.trash.Trash.java

License:Apache License

/**
 * Makes sure a usable trash directory exists at the given location.
 *
 * <p>A missing location is created (parent first, then the directory, then the trash identifier file).
 * An existing location must be a directory and must either already contain the identifier file or be
 * empty, in which case the identifier file is added.</p>
 *
 * @param fs file system holding the trash
 * @param trashLocation path of the trash directory
 * @throws IOException if the location is not a directory, is a non-empty directory without the identifier
 *         file, or the directory or identifier file cannot be created
 */
protected void ensureTrashLocationExists(FileSystem fs, Path trashLocation) throws IOException {
    if (!fs.exists(trashLocation)) {
        // Nothing there yet: build the directory and mark it as a trash directory.
        boolean created = safeFsMkdir(fs, trashLocation.getParent(), ALL_PERM)
                && safeFsMkdir(fs, trashLocation, PERM)
                && fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE));
        if (!created) {
            // Failed to create directory or create trash identifier file.
            throw new IOException("Failed to create trash directory at " + trashLocation.toString());
        }
        return;
    }

    if (!fs.isDirectory(trashLocation)) {
        throw new IOException(String.format("Trash location %s is not a directory.", trashLocation));
    }

    if (fs.exists(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) {
        return;
    }

    // If trash identifier file is not present, directory might have been created by user.
    // Add trash identifier file only if directory is empty.
    if (fs.listStatus(trashLocation).length > 0) {
        throw new IOException(String.format(
                "Trash directory %s exists, but it does not look like a trash directory. "
                        + "File: %s missing and directory is not empty.",
                trashLocation, TRASH_IDENTIFIER_FILE));
    }
    if (!fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) {
        throw new IOException(String.format("Failed to create file %s in existing trash directory %s.",
                TRASH_IDENTIFIER_FILE, trashLocation));
    }
}

From source file:org.apache.gobblin.hive.avro.HiveAvroSerDeManager.java

License:Apache License

/**
 * Add a {@link Schema} obtained from an Avro data file to the given {@link HiveRegistrationUnit}.
 *
 *  <p>
 *    If the length of the schema string is at most the configured schema literal length limit, it will be
 *    added via {@link #SCHEMA_LITERAL}. Otherwise, the schema will be written to {@code schemaFile} and added
 *    via {@link #SCHEMA_URL}.
 *  </p>
 *
 * @param schema the schema to register; must not be null
 * @param schemaFile file the schema is written to when it is too long to be set as a literal
 * @param hiveUnit registration unit receiving the serde property
 * @throws IOException if writing the schema file fails
 */
protected void addSchemaFromAvroFile(Schema schema, Path schemaFile, HiveRegistrationUnit hiveUnit)
        throws IOException {
    Preconditions.checkNotNull(schema);

    // Serialize once and reuse the string for both the length check and the literal property.
    String schemaStr = schema.toString();
    if (schemaStr.length() <= this.schemaLiteralLengthLimit) {
        hiveUnit.setSerDeProp(SCHEMA_LITERAL, schemaStr);
    } else {
        Path schemaTempFile = null;

        if (useSchemaTempFile) {
            // The temp file is placed next to the final schema file.
            schemaTempFile = new Path(schemaFile.getParent(), this.schemaTempFileName);
        }

        AvroUtils.writeSchemaToFile(schema, schemaFile, schemaTempFile, this.fs, true);
        log.info("Using schema file " + schemaFile.toString());
        hiveUnit.setSerDeProp(SCHEMA_URL, schemaFile.toString());
    }
}

From source file:org.apache.gobblin.publisher.BaseDataPublisher.java

License:Apache License

/**
 * Publishes the writer output of one branch of a work unit to the publisher output directory.
 *
 * <p>With {@code publishSingleTaskData} the task's files are added to the (created if needed) final
 * directory. Otherwise the whole writer output directory is handled once per directory: it is merged
 * into an existing final directory, or moved in place of it when replacement is configured or when
 * the final directory does not exist yet.</p>
 *
 * @param state state of the work unit whose data is published
 * @param branchId branch being published
 * @param publishSingleTaskData whether only the files of this single task are published
 * @param writerOutputPathsMoved writer output directories already handled; updated by this method
 * @throws IOException on file system failure
 */
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData,
        Set<Path> writerOutputPathsMoved) throws IOException {
    // Runner used for moving files in parallel.
    ParallelRunner parallelRunner = this.getParallelRunner(this.writerFileSystemByBranches.get(branchId));

    // Where the work unit wrote its output data.
    Path writerOutputDir = WriterUtils.getWriterOutputDir(state, this.numBranches, branchId);

    if (!this.writerFileSystemByBranches.get(branchId).exists(writerOutputDir)) {
        LOG.warn(String.format("Branch %d of WorkUnit %s produced no data", branchId, state.getId()));
        return;
    }

    // Final output location for this job: a combination of DATA_PUBLISHER_FINAL_DIR and WRITER_FILE_PATH.
    Path publisherOutputDir = getPublisherOutputDir(state, branchId);

    if (publishSingleTaskData) {
        // Single-task mode never replaces the final directory and ignores writerOutputPathsMoved.
        WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
                publisherOutputDir, this.permissions.get(branchId), retrierConfig);
        addSingleTaskWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId,
                parallelRunner);
        return;
    }

    if (writerOutputPathsMoved.contains(writerOutputDir)) {
        // Another task of the same extract already moved this writer output path.
        return;
    }

    if (!this.publisherFileSystemByBranches.get(branchId).exists(publisherOutputDir)) {
        // Only the parent has to exist; the writer output directory is moved into place below.
        WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
                publisherOutputDir.getParent(), this.permissions.get(branchId), retrierConfig);
    } else {
        // The final output directory already exists: check whether the job is configured to replace it.
        boolean replaceExisting = this.getState()
                .getPropAsBoolean(ForkOperatorUtils.getPropertyNameForBranch(
                        ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId));

        if (!replaceExisting) {
            // Keep the existing directory and put the new data into it.
            addWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId,
                    parallelRunner);
            writerOutputPathsMoved.add(writerOutputDir);
            return;
        }

        // Replacement is configured: remove the old directory before moving the new one in.
        LOG.info("Deleting publisher output dir " + publisherOutputDir);
        this.publisherFileSystemByBranches.get(branchId).delete(publisherOutputDir, true);
    }

    movePath(parallelRunner, state, writerOutputDir, publisherOutputDir, branchId);
    writerOutputPathsMoved.add(writerOutputDir);
}

From source file:org.apache.gobblin.publisher.BaseDataPublisher.java

License:Apache License

/**
 * Moves each output file recorded for a single task into the publisher output directory, keeping
 * the file's path relative to the writer output directory.
 *
 * <p>Files listed in the work unit state that no longer exist on the writer file system are skipped
 * with a warning. If the state carries no list of output files at all, nothing is published.</p>
 *
 * @param writerOutputDir directory the task wrote its files under
 * @param publisherOutputDir directory the files are published to
 * @param workUnitState state holding the list of final output file paths
 * @param branchId branch being published
 * @param parallelRunner runner used for the moves
 * @throws IOException on file system failure
 */
protected void addSingleTaskWriterOutputToExistingDir(Path writerOutputDir, Path publisherOutputDir,
        WorkUnitState workUnitState, int branchId, ParallelRunner parallelRunner) throws IOException {
    String finalOutputPathsKey = ForkOperatorUtils.getPropertyNameForBranch(
            ConfigurationKeys.WRITER_FINAL_OUTPUT_FILE_PATHS, this.numBranches, branchId);

    if (!workUnitState.contains(finalOutputPathsKey)) {
        LOG.warn("Missing property " + finalOutputPathsKey + ". This task may have pulled no data.");
        return;
    }

    for (String writtenFile : workUnitState.getPropAsSet(finalOutputPathsKey)) {
        Path writtenPath = new Path(writtenFile);
        if (this.writerFileSystemByBranches.get(branchId).exists(writtenPath)) {
            // The suffix is whatever follows "<writerOutputDir>/" inside the recorded file path.
            String writerDirName = writerOutputDir.toString();
            int suffixStart = writtenFile.indexOf(writerDirName) + writerDirName.length() + 1;
            Path publishedPath = new Path(publisherOutputDir, writtenFile.substring(suffixStart));

            WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
                    publishedPath.getParent(), this.permissions.get(branchId), retrierConfig);

            movePath(parallelRunner, workUnitState, writtenPath, publishedPath, branchId);
        } else {
            LOG.warn("Task output file " + writtenFile + " doesn't exist.");
        }
    }
}

From source file:org.apache.gobblin.publisher.BaseDataPublisher.java

License:Apache License

/**
 * Writes the metadata of one branch as a UTF-8 file at the given path, replacing an existing file.
 *
 * <p>Nothing is written when either the path or the metadata value is null. An {@link IOException}
 * raised while publishing is logged and not propagated.</p>
 *
 * @param metadataValue metadata content to write; may be null
 * @param branchId branch the metadata belongs to
 * @param metadataOutputPath file to write the metadata to; may be null
 * @throws IOException declared by the signature; I/O failures inside the method are caught and logged
 */
private void publishMetadata(String metadataValue, int branchId, Path metadataOutputPath) throws IOException {
    try {
        if (metadataOutputPath == null) {
            LOG.info("Metadata output path not set for branch " + branchId + ", not publishing.");
            return;
        }

        if (metadataValue == null) {
            LOG.info("No metadata collected for branch " + branchId + ", not publishing.");
            return;
        }

        FileSystem fs = this.metaDataWriterFileSystemByBranches.get(branchId);

        // Create the missing parent directory, not a directory at the file path itself.
        // getParent() returns null at root, in which case there is nothing to create.
        Path metadataParentDir = metadataOutputPath.getParent();
        if (metadataParentDir != null && !fs.exists(metadataParentDir)) {
            WriterUtils.mkdirsWithRecursivePermissionWithRetry(fs, metadataParentDir,
                    this.permissions.get(branchId), retrierConfig);
        }

        //Delete the file if metadata already exists
        if (fs.exists(metadataOutputPath)) {
            HadoopUtils.deletePath(fs, metadataOutputPath, false);
        }
        LOG.info("Writing metadata for branch " + branchId + " to " + metadataOutputPath.toString());
        try (FSDataOutputStream outputStream = fs.create(metadataOutputPath)) {
            outputStream.write(metadataValue.getBytes(StandardCharsets.UTF_8));
        }
    } catch (IOException e) {
        // Best effort: a failure to write metadata is reported but does not fail the publish.
        LOG.error("Metadata file is not generated: " + e, e);
    }
}