List of usage examples for org.apache.hadoop.fs Path getParent
public Path getParent()
From source file:org.apache.gobblin.data.management.copy.CopyableFile.java
License:Apache License
/**
 * Derives the file-system based source and destination datasets of this {@link CopyableFile}
 * and stores them in {@code sourceData} and {@code destinationData}.
 *
 * <p>When the origin is a directory, the dataset name is the path itself; otherwise it is the
 * path's parent folder. The same directory flag (taken from the origin) is applied to the
 * destination path. Scheme and authority are stripped from both paths before naming.
 *
 * @param originFs {@link FileSystem} this {@link CopyableFile} originates from
 * @param targetFs {@link FileSystem} this {@link CopyableFile} is copied to
 */
public void setFsDatasets(FileSystem originFs, FileSystem targetFs) {
  final boolean originIsDirectory = origin.isDirectory();

  // Source side: a plain file is attributed to its containing folder.
  Path sourceDatasetPath = Path.getPathWithoutSchemeAndAuthority(origin.getPath());
  if (!originIsDirectory) {
    sourceDatasetPath = sourceDatasetPath.getParent();
  }
  String sourceName = sourceDatasetPath.toString();
  DatasetDescriptor sourceDescriptor = new DatasetDescriptor(originFs.getScheme(), sourceName);
  sourceDescriptor.addMetadata(DatasetConstants.FS_URI, originFs.getUri().toString());
  sourceData = sourceDescriptor;

  // Destination side: mirrors the source logic, reusing the origin's directory flag.
  Path destinationDatasetPath = Path.getPathWithoutSchemeAndAuthority(destination);
  if (!originIsDirectory) {
    destinationDatasetPath = destinationDatasetPath.getParent();
  }
  String destinationName = destinationDatasetPath.toString();
  DatasetDescriptor destinationDescriptor = new DatasetDescriptor(targetFs.getScheme(), destinationName);
  destinationDescriptor.addMetadata(DatasetConstants.FS_URI, targetFs.getUri().toString());
  destinationData = destinationDescriptor;
}
From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java
License:Apache License
@Override public final void writeImpl(FileAwareInputStream fileAwareInputStream) throws IOException { CopyableFile copyableFile = fileAwareInputStream.getFile(); if (encryptionConfig != null) { copyableFile.setDestination(PathUtils.addExtension(copyableFile.getDestination(), "." + EncryptionConfigParser.getEncryptionType(encryptionConfig))); }//w ww.j a va 2s.co m Path stagingFile = getStagingFilePath(copyableFile); if (this.actualProcessedCopyableFile.isPresent()) { throw new IOException(this.getClass().getCanonicalName() + " can only process one file."); } this.actualProcessedCopyableFile = Optional.of(copyableFile); this.fs.mkdirs(stagingFile.getParent()); writeImpl(fileAwareInputStream.getInputStream(), stagingFile, copyableFile, fileAwareInputStream); this.filesWritten.incrementAndGet(); }
From source file:org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter.java
License:Apache License
/** * Moves the file from task staging to task output. Each task has its own staging directory but all the tasks share * the same task output directory.//w w w .ja va 2 s . com * * {@inheritDoc} * * @see DataWriter#commit() */ @Override public void commit() throws IOException { if (!this.actualProcessedCopyableFile.isPresent()) { return; } CopyableFile copyableFile = this.actualProcessedCopyableFile.get(); Path stagingFilePath = getStagingFilePath(copyableFile); Path outputFilePath = getSplitOutputFilePath(copyableFile, this.outputDir, copyableFile.getDatasetAndPartition(this.copyableDatasetMetadata), this.state); log.info(String.format("Committing data from %s to %s", stagingFilePath, outputFilePath)); try { setFilePermissions(copyableFile); Iterator<OwnerAndPermission> ancestorOwnerAndPermissionIt = copyableFile .getAncestorsOwnerAndPermission() == null ? Iterators.<OwnerAndPermission>emptyIterator() : copyableFile.getAncestorsOwnerAndPermission().iterator(); ensureDirectoryExists(this.fs, outputFilePath.getParent(), ancestorOwnerAndPermissionIt); this.fileContext.rename(stagingFilePath, outputFilePath, renameOptions); } catch (IOException ioe) { log.error("Could not commit file %s.", outputFilePath); // persist file this.recoveryHelper.persistFile(this.state, copyableFile, stagingFilePath); throw ioe; } finally { try { this.fs.delete(this.stagingDir, true); } catch (IOException ioe) { log.warn("Failed to delete staging path at " + this.stagingDir); } } }
From source file:org.apache.gobblin.data.management.copy.writer.TarArchiveInputStreamDataWriter.java
License:Apache License
/** * Untars the passed in {@link FileAwareInputStream} to the task's staging directory. Uses the name of the root * {@link TarArchiveEntry} in the stream as the directory name for the untarred file. The method also commits the data * by moving the file from staging to output directory. * * @see org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter#write(org.apache.gobblin.data.management.copy.FileAwareInputStream) *//*from w w w . j av a2 s . c om*/ @Override public void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile, FileAwareInputStream record) throws IOException { this.closer.register(inputStream); TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream); final ReadableByteChannel inputChannel = Channels.newChannel(tarIn); TarArchiveEntry tarEntry; // flush the first entry in the tar, which is just the root directory tarEntry = tarIn.getNextTarEntry(); String tarEntryRootName = StringUtils.remove(tarEntry.getName(), Path.SEPARATOR); log.info("Unarchiving at " + writeAt); try { while ((tarEntry = tarIn.getNextTarEntry()) != null) { // the API tarEntry.getName() is misleading, it is actually the path of the tarEntry in the tar file String newTarEntryPath = tarEntry.getName().replace(tarEntryRootName, writeAt.getName()); Path tarEntryStagingPath = new Path(writeAt.getParent(), newTarEntryPath); if (!FileUtils.isSubPath(writeAt.getParent(), tarEntryStagingPath)) { throw new IOException( String.format("Extracted file: %s is trying to write outside of output directory: %s", tarEntryStagingPath, writeAt.getParent())); } if (tarEntry.isDirectory() && !this.fs.exists(tarEntryStagingPath)) { this.fs.mkdirs(tarEntryStagingPath); } else if (!tarEntry.isDirectory()) { FSDataOutputStream out = this.fs.create(tarEntryStagingPath, true); final WritableByteChannel outputChannel = Channels.newChannel(out); try { StreamCopier copier = new StreamCopier(inputChannel, outputChannel); if (isInstrumentationEnabled()) { 
copier.withCopySpeedMeter(this.copySpeedMeter); } this.bytesWritten.addAndGet(copier.copy()); if (isInstrumentationEnabled()) { log.info("File {}: copied {} bytes, average rate: {} B/s", copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(), this.copySpeedMeter.getMeanRate()); } else { log.info("File {} copied.", copyableFile.getOrigin().getPath()); } } finally { out.close(); outputChannel.close(); } } } } finally { tarIn.close(); inputChannel.close(); inputStream.close(); } }
From source file:org.apache.gobblin.data.management.policy.HiddenFilterSelectionPolicy.java
License:Apache License
private boolean isPathHidden(Path path) { while (path != null) { String name = path.getName(); for (String prefix : this.hiddenFilePrefixes) { if (name.startsWith(prefix)) { return true; }// w w w . ja va2 s . c om } path = path.getParent(); } return false; }
From source file:org.apache.gobblin.data.management.trash.Trash.java
License:Apache License
protected void ensureTrashLocationExists(FileSystem fs, Path trashLocation) throws IOException { if (fs.exists(trashLocation)) { if (!fs.isDirectory(trashLocation)) { throw new IOException(String.format("Trash location %s is not a directory.", trashLocation)); }//w w w. j a v a 2 s . c o m if (!fs.exists(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) { // If trash identifier file is not present, directory might have been created by user. // Add trash identifier file only if directory is empty. if (fs.listStatus(trashLocation).length > 0) { throw new IOException(String.format( "Trash directory %s exists, but it does not look like a trash directory. " + "File: %s missing and directory is not empty.", trashLocation, TRASH_IDENTIFIER_FILE)); } else if (!fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE))) { throw new IOException(String.format("Failed to create file %s in existing trash directory %s.", TRASH_IDENTIFIER_FILE, trashLocation)); } } } else if (!(safeFsMkdir(fs, trashLocation.getParent(), ALL_PERM) && safeFsMkdir(fs, trashLocation, PERM) && fs.createNewFile(new Path(trashLocation, TRASH_IDENTIFIER_FILE)))) { // Failed to create directory or create trash identifier file. throw new IOException("Failed to create trash directory at " + trashLocation.toString()); } }
From source file:org.apache.gobblin.hive.avro.HiveAvroSerDeManager.java
License:Apache License
/**
 * Add a {@link Schema} obtained from an Avro data file to the given {@link HiveRegistrationUnit}.
 *
 * <p>
 *   If the length of the schema string does not exceed {@code schemaLiteralLengthLimit}, it is added via
 *   {@link #SCHEMA_LITERAL}. Otherwise, the schema is written to {@code schemaFile} and that path is added
 *   via {@link #SCHEMA_URL}. When {@code useSchemaTempFile} is set, a temp file named
 *   {@code schemaTempFileName} in the parent directory of {@code schemaFile} is passed to the schema writer.
 * </p>
 *
 * @param schema the Avro schema to register; must not be {@code null}
 * @param schemaFile the file the schema is written to when it is too long for a literal
 * @param hiveUnit the registration unit receiving the SerDe property
 * @throws IOException declared for the schema file write
 */
protected void addSchemaFromAvroFile(Schema schema, Path schemaFile, HiveRegistrationUnit hiveUnit)
    throws IOException {
  Preconditions.checkNotNull(schema);

  String schemaStr = schema.toString();
  if (schemaStr.length() <= this.schemaLiteralLengthLimit) {
    // Reuse the string computed above instead of serializing the schema a second time.
    hiveUnit.setSerDeProp(SCHEMA_LITERAL, schemaStr);
    return;
  }

  Path schemaTempFile = null;
  if (useSchemaTempFile) {
    schemaTempFile = new Path(schemaFile.getParent(), this.schemaTempFileName);
  }
  AvroUtils.writeSchemaToFile(schema, schemaFile, schemaTempFile, this.fs, true);
  log.info("Using schema file " + schemaFile.toString());
  hiveUnit.setSerDeProp(SCHEMA_URL, schemaFile.toString());
}
From source file:org.apache.gobblin.publisher.BaseDataPublisher.java
License:Apache License
protected void publishData(WorkUnitState state, int branchId, boolean publishSingleTaskData, Set<Path> writerOutputPathsMoved) throws IOException { // Get a ParallelRunner instance for moving files in parallel ParallelRunner parallelRunner = this.getParallelRunner(this.writerFileSystemByBranches.get(branchId)); // The directory where the workUnitState wrote its output data. Path writerOutputDir = WriterUtils.getWriterOutputDir(state, this.numBranches, branchId); if (!this.writerFileSystemByBranches.get(branchId).exists(writerOutputDir)) { LOG.warn(String.format("Branch %d of WorkUnit %s produced no data", branchId, state.getId())); return;//from ww w.ja va 2s . c om } // The directory where the final output directory for this job will be placed. // It is a combination of DATA_PUBLISHER_FINAL_DIR and WRITER_FILE_PATH. Path publisherOutputDir = getPublisherOutputDir(state, branchId); if (publishSingleTaskData) { // Create final output directory WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId), publisherOutputDir, this.permissions.get(branchId), retrierConfig); addSingleTaskWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); } else { if (writerOutputPathsMoved.contains(writerOutputDir)) { // This writer output path has already been moved for another task of the same extract // If publishSingleTaskData=true, writerOutputPathMoved is ignored. return; } if (this.publisherFileSystemByBranches.get(branchId).exists(publisherOutputDir)) { // The final output directory already exists, check if the job is configured to replace it. // If publishSingleTaskData=true, final output directory is never replaced. 
boolean replaceFinalOutputDir = this.getState() .getPropAsBoolean(ForkOperatorUtils.getPropertyNameForBranch( ConfigurationKeys.DATA_PUBLISHER_REPLACE_FINAL_DIR, this.numBranches, branchId)); // If the final output directory is not configured to be replaced, put new data to the existing directory. if (!replaceFinalOutputDir) { addWriterOutputToExistingDir(writerOutputDir, publisherOutputDir, state, branchId, parallelRunner); writerOutputPathsMoved.add(writerOutputDir); return; } // Delete the final output directory if it is configured to be replaced LOG.info("Deleting publisher output dir " + publisherOutputDir); this.publisherFileSystemByBranches.get(branchId).delete(publisherOutputDir, true); } else { // Create the parent directory of the final output directory if it does not exist WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId), publisherOutputDir.getParent(), this.permissions.get(branchId), retrierConfig); } movePath(parallelRunner, state, writerOutputDir, publisherOutputDir, branchId); writerOutputPathsMoved.add(writerOutputDir); } }
From source file:org.apache.gobblin.publisher.BaseDataPublisher.java
License:Apache License
/**
 * Moves each output file recorded for a single task into the publisher output directory, keeping
 * the file's path relative to the writer output directory.
 *
 * <p>The file list is read from the branch-specific {@code WRITER_FINAL_OUTPUT_FILE_PATHS} property;
 * if that property is missing nothing is moved. Listed files that do not exist are skipped with a
 * warning. The parent directory of every target path is created before the move.
 *
 * @param writerOutputDir directory the task wrote its files under
 * @param publisherOutputDir directory the files are published to
 * @param workUnitState state holding the list of task output files
 * @param branchId the branch being published
 * @param parallelRunner runner handed to {@code movePath}
 * @throws IOException if a file system operation fails
 */
protected void addSingleTaskWriterOutputToExistingDir(Path writerOutputDir, Path publisherOutputDir,
    WorkUnitState workUnitState, int branchId, ParallelRunner parallelRunner) throws IOException {
  String outputFilePropName = ForkOperatorUtils.getPropertyNameForBranch(
      ConfigurationKeys.WRITER_FINAL_OUTPUT_FILE_PATHS, this.numBranches, branchId);

  if (!workUnitState.contains(outputFilePropName)) {
    LOG.warn("Missing property " + outputFilePropName + ". This task may have pulled no data.");
    return;
  }

  for (String taskOutputFile : workUnitState.getPropAsSet(outputFilePropName)) {
    Path taskOutputPath = new Path(taskOutputFile);

    if (this.writerFileSystemByBranches.get(branchId).exists(taskOutputPath)) {
      // Everything after "<writerOutputDir>/" is the file's relative location.
      // NOTE(review): an indexOf result of -1 (file not under writerOutputDir) is not handled here.
      String writerDir = writerOutputDir.toString();
      int suffixStart = taskOutputFile.indexOf(writerDir) + writerDir.length() + 1;
      String pathSuffix = taskOutputFile.substring(suffixStart);

      Path publisherOutputPath = new Path(publisherOutputDir, pathSuffix);
      WriterUtils.mkdirsWithRecursivePermissionWithRetry(this.publisherFileSystemByBranches.get(branchId),
          publisherOutputPath.getParent(), this.permissions.get(branchId), retrierConfig);

      movePath(parallelRunner, workUnitState, taskOutputPath, publisherOutputPath, branchId);
    } else {
      LOG.warn("Task output file " + taskOutputFile + " doesn't exist.");
    }
  }
}
From source file:org.apache.gobblin.publisher.BaseDataPublisher.java
License:Apache License
/** * Publish metadata to a set of paths//from w ww. j a va 2 s.c o m */ private void publishMetadata(String metadataValue, int branchId, Path metadataOutputPath) throws IOException { try { if (metadataOutputPath == null) { LOG.info("Metadata output path not set for branch " + String.valueOf(branchId) + ", not publishing."); return; } if (metadataValue == null) { LOG.info("No metadata collected for branch " + String.valueOf(branchId) + ", not publishing."); return; } FileSystem fs = this.metaDataWriterFileSystemByBranches.get(branchId); if (!fs.exists(metadataOutputPath.getParent())) { WriterUtils.mkdirsWithRecursivePermissionWithRetry(fs, metadataOutputPath, this.permissions.get(branchId), retrierConfig); } //Delete the file if metadata already exists if (fs.exists(metadataOutputPath)) { HadoopUtils.deletePath(fs, metadataOutputPath, false); } LOG.info("Writing metadata for branch " + String.valueOf(branchId) + " to " + metadataOutputPath.toString()); try (FSDataOutputStream outputStream = fs.create(metadataOutputPath)) { outputStream.write(metadataValue.getBytes(StandardCharsets.UTF_8)); } } catch (IOException e) { LOG.error("Metadata file is not generated: " + e, e); } }