List of usage examples for org.apache.hadoop.fs.Path.SEPARATOR
Field: String SEPARATOR, the path separator string "/".
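Path.SEPARATOR is the separator string "/" that Hadoop places between path components (Path.SEPARATOR_CHAR is the corresponding char). The examples below mostly concatenate it between directory names and then wrap the result in a new Path. A minimal standalone sketch of that pattern, using a hypothetical work directory and job id (the names echo the Gobblin example further down but are not taken from it):

import org.apache.hadoop.fs.Path;

public class PathSeparatorSketch {
    public static void main(String[] args) {
        // Hypothetical work directory and job id, chosen only for illustration.
        String jobId = "job_123";
        String relative = "_jobstate" + Path.SEPARATOR + jobId + Path.SEPARATOR + jobId + ".job.state";

        // Path(parent, child) joins and normalizes the separators in the final path.
        Path jobStateFile = new Path("/apps/work", relative);
        System.out.println(jobStateFile); // /apps/work/_jobstate/job_123/job_123.job.state
    }
}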
From source file:org.apache.falcon.snapshots.replication.HdfsSnapshotReplicator.java
License:Apache License
private String getSnapshotDir(String dirName) {
    // Trim any trailing separator so the join below never produces a double "//".
    dirName = StringUtils.removeEnd(dirName, Path.SEPARATOR);
    return dirName + Path.SEPARATOR + HdfsSnapshotUtil.SNAPSHOT_DIR_PREFIX + Path.SEPARATOR;
}
From source file:org.apache.falcon.snapshots.retention.HdfsSnapshotEvictor.java
License:Apache License
protected static void evictSnapshots(DistributedFileSystem fs, String dirName, String ageLimit, int numSnapshots)
        throws FalconException {
    try {
        LOG.info("Started evicting snapshots on dir {}{} using policy {}, agelimit {}, numSnapshot {}",
                fs.getUri(), dirName, ageLimit, numSnapshots);

        long evictionTime = System.currentTimeMillis() - EvictionHelper.evalExpressionToMilliSeconds(ageLimit);

        dirName = StringUtils.removeEnd(dirName, Path.SEPARATOR);
        String snapshotDir = dirName + Path.SEPARATOR + HdfsSnapshotUtil.SNAPSHOT_DIR_PREFIX + Path.SEPARATOR;
        FileStatus[] snapshots = fs.listStatus(new Path(snapshotDir));
        if (snapshots.length <= numSnapshots) {
            // no eviction needed
            return;
        }

        // Sort by last modified time, ascending order.
        Arrays.sort(snapshots, new Comparator<FileStatus>() {
            @Override
            public int compare(FileStatus f1, FileStatus f2) {
                return Long.compare(f1.getModificationTime(), f2.getModificationTime());
            }
        });

        for (int i = 0; i < (snapshots.length - numSnapshots); i++) {
            // delete if older than ageLimit while retaining numSnapshots
            if (snapshots[i].getModificationTime() < evictionTime) {
                fs.deleteSnapshot(new Path(dirName), snapshots[i].getPath().getName());
            }
        }
    } catch (ELException ele) {
        LOG.warn("Unable to parse retention age limit {} {}", ageLimit, ele.getMessage());
        throw new FalconException("Unable to parse retention age limit " + ageLimit, ele);
    } catch (IOException ioe) {
        LOG.warn("Unable to evict snapshots from dir {} {}", dirName, ioe);
        throw new FalconException("Unable to evict snapshots from dir " + dirName, ioe);
    }
}
From source file:org.apache.flume.sink.hive.TestUtil.java
License:Apache License
public static void createDbAndTable(Driver driver, String databaseName, String tableName, List<String> partVals,
        String[] colNames, String[] colTypes, String[] partNames, String dbLocation) throws Exception {
    String dbUri = "raw://" + dbLocation;
    String tableLoc = dbUri + Path.SEPARATOR + tableName;

    runDDL(driver, "create database IF NOT EXISTS " + databaseName + " location '" + dbUri + "'");
    runDDL(driver, "use " + databaseName);

    String crtTbl = "create table " + tableName + " ( " + getTableColumnsStr(colNames, colTypes) + " )"
            + getPartitionStmtStr(partNames) + " clustered by ( " + colNames[0] + " )" + " into 10 buckets "
            + " stored as orc " + " location '" + tableLoc + "'" + " TBLPROPERTIES ('transactional'='true')";
    runDDL(driver, crtTbl);
    System.out.println("crtTbl = " + crtTbl);

    if (partNames != null && partNames.length != 0) {
        String addPart = "alter table " + tableName + " add partition ( "
                + getTablePartsStr2(partNames, partVals) + " )";
        runDDL(driver, addPart);
    }
}
From source file:org.apache.gobblin.cluster.GobblinClusterUtils.java
License:Apache License
/**
 * Generate the path to the job.state file
 * @param usingStateStore is a state store being used to store the job.state content
 * @param appWorkPath work directory
 * @param jobId job id
 * @return a {@link Path} referring to the job.state
 */
public static Path getJobStateFilePath(boolean usingStateStore, Path appWorkPath, String jobId) {
    final Path jobStateFilePath;

    // the state store uses a path of the form workdir/_jobstate/job_id/job_id.job.state while the old method
    // stores the file in the app work dir.
    if (usingStateStore) {
        jobStateFilePath = new Path(appWorkPath, GobblinClusterConfigurationKeys.JOB_STATE_DIR_NAME
                + Path.SEPARATOR + jobId + Path.SEPARATOR + jobId + "." + AbstractJobLauncher.JOB_STATE_FILE_NAME);
    } else {
        jobStateFilePath = new Path(appWorkPath, jobId + "." + AbstractJobLauncher.JOB_STATE_FILE_NAME);
    }

    log.info("job state file path: " + jobStateFilePath);

    return jobStateFilePath;
}
From source file:org.apache.gobblin.cluster.GobblinHelixJobLauncher.java
License:Apache License
public GobblinHelixJobLauncher(Properties jobProps, final HelixManager helixManager, Path appWorkDir,
        List<? extends Tag<?>> metadataTags, ConcurrentHashMap<String, Boolean> runningMap,
        Optional<GobblinHelixMetrics> helixMetrics) throws Exception {
    super(jobProps, addAdditionalMetadataTags(jobProps, metadataTags));

    LOGGER.debug("GobblinHelixJobLauncher: jobProps {}, appWorkDir {}", jobProps, appWorkDir);

    this.helixManager = helixManager;
    this.helixTaskDriver = new TaskDriver(this.helixManager);
    this.runningMap = runningMap;
    this.appWorkDir = appWorkDir;
    this.inputWorkUnitDir = new Path(appWorkDir, GobblinClusterConfigurationKeys.INPUT_WORK_UNIT_DIR_NAME);
    this.outputTaskStateDir = new Path(this.appWorkDir,
            GobblinClusterConfigurationKeys.OUTPUT_TASK_STATE_DIR_NAME + Path.SEPARATOR + this.jobContext.getJobId());

    this.helixWorkFlowName = this.jobContext.getJobId();
    this.jobContext.getJobState().setJobLauncherType(LauncherTypeEnum.CLUSTER);

    this.stateSerDeRunnerThreads = Integer.parseInt(jobProps.getProperty(ParallelRunner.PARALLEL_RUNNER_THREADS_KEY,
            Integer.toString(ParallelRunner.DEFAULT_PARALLEL_RUNNER_THREADS)));

    jobConfig = ConfigUtils.propertiesToConfig(jobProps);

    this.workFlowExpiryTimeSeconds = ConfigUtils.getLong(jobConfig,
            GobblinClusterConfigurationKeys.HELIX_WORKFLOW_EXPIRY_TIME_SECONDS,
            GobblinClusterConfigurationKeys.DEFAULT_HELIX_WORKFLOW_EXPIRY_TIME_SECONDS);

    this.helixJobStopTimeoutSeconds = ConfigUtils.getLong(jobConfig,
            GobblinClusterConfigurationKeys.HELIX_JOB_STOP_TIMEOUT_SECONDS,
            GobblinClusterConfigurationKeys.DEFAULT_HELIX_JOB_STOP_TIMEOUT_SECONDS);

    Config stateStoreJobConfig = ConfigUtils.propertiesToConfig(jobProps).withValue(
            ConfigurationKeys.STATE_STORE_FS_URI_KEY,
            ConfigValueFactory.fromAnyRef(new URI(appWorkDir.toUri().getScheme(), null, appWorkDir.toUri().getHost(),
                    appWorkDir.toUri().getPort(), null, null, null).toString()));

    this.stateStores = new StateStores(stateStoreJobConfig, appWorkDir,
            GobblinClusterConfigurationKeys.OUTPUT_TASK_STATE_DIR_NAME, appWorkDir,
            GobblinClusterConfigurationKeys.INPUT_WORK_UNIT_DIR_NAME, appWorkDir,
            GobblinClusterConfigurationKeys.JOB_STATE_DIR_NAME);

    URI fsUri = URI.create(jobProps.getProperty(ConfigurationKeys.FS_URI_KEY, ConfigurationKeys.LOCAL_FS_URI));
    this.fs = FileSystem.get(fsUri, new Configuration());

    this.taskStateCollectorService = new TaskStateCollectorService(jobProps, this.jobContext.getJobState(),
            this.eventBus, this.stateStores.getTaskStateStore(), this.outputTaskStateDir);

    this.helixMetrics = helixMetrics;
    startCancellationExecutor();
}
From source file:org.apache.gobblin.compaction.mapreduce.CompactionJobConfigurator.java
License:Apache License
private static boolean isFailedPath(Path path, List<TaskCompletionEvent> failedEvents) {
    // A path is considered failed if it is still under a _temporary directory, or if one of its
    // separator-delimited segments equals the attempt id of a failed task completion event.
    return path.toString().contains("_temporary") || failedEvents.stream()
            .anyMatch(event -> path.toString()
                    .contains(Path.SEPARATOR + event.getTaskAttemptId().toString() + Path.SEPARATOR));
}
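Wrapping the attempt id with Path.SEPARATOR on both sides restricts the match to a whole path segment, so a failed attempt id cannot accidentally match a path that merely starts with the same characters. A small standalone sketch (hypothetical path and attempt ids, not part of the Gobblin code) showing the difference:

import org.apache.hadoop.fs.Path;

public class SeparatorDelimitedMatch {
    public static void main(String[] args) {
        // Hypothetical task output path and failed attempt id.
        String path = "/tmp/job/_temporary/attempt_10/part-00000";
        String failedAttempt = "attempt_1";

        // Naive substring check: prints true, even though attempt_1 and attempt_10 are different attempts.
        System.out.println(path.contains(failedAttempt));

        // Separator-delimited check, as in isFailedPath above: prints false, since "/attempt_1/" is not a segment.
        System.out.println(path.contains(Path.SEPARATOR + failedAttempt + Path.SEPARATOR));
    }
}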
From source file:org.apache.gobblin.data.management.conversion.hive.converter.AbstractAvroToOrcConverter.java
License:Apache License
/**
 * Populate the avro to orc conversion queries. The queries will be added to
 * {@link QueryBasedHiveConversionEntity#getQueries()}
 */
@Override
public Iterable<QueryBasedHiveConversionEntity> convertRecord(Schema outputAvroSchema,
        QueryBasedHiveConversionEntity conversionEntity, WorkUnitState workUnit) throws DataConversionException {
    Preconditions.checkNotNull(outputAvroSchema, "Avro schema must not be null");
    Preconditions.checkNotNull(conversionEntity, "Conversion entity must not be null");
    Preconditions.checkNotNull(workUnit, "Workunit state must not be null");
    Preconditions.checkNotNull(conversionEntity.getTable(), "Hive table within conversion entity must not be null");

    EventWorkunitUtils.setBeginDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());

    this.hiveDataset = conversionEntity.getConvertibleHiveDataset();

    if (!hasConversionConfig()) {
        return new SingleRecordIterable<>(conversionEntity);
    }

    // Avro table name and location
    String avroTableName = conversionEntity.getTable().getTableName();

    // ORC table name and location
    String orcTableName = getConversionConfig().getDestinationTableName();
    String orcStagingTableName = getOrcStagingTableName(getConversionConfig().getDestinationStagingTableName());
    String orcTableDatabase = getConversionConfig().getDestinationDbName();
    String orcDataLocation = getOrcDataLocation();
    String orcStagingDataLocation = getOrcStagingDataLocation(orcStagingTableName);
    boolean isEvolutionEnabled = getConversionConfig().isEvolutionEnabled();
    Pair<Optional<Table>, Optional<List<Partition>>> destinationMeta = HiveConverterUtils
            .getDestinationTableMeta(orcTableDatabase, orcTableName, workUnit.getProperties());
    Optional<Table> destinationTableMeta = destinationMeta.getLeft();

    // Optional
    // View registration blacklist / whitelist
    Optional<WhitelistBlacklist> optionalViewRegistrationWhiteBlacklist = getViewWhiteBackListFromWorkUnit(workUnit);

    // wrapperViewName : If specified, a view with 'wrapperViewName' is created, if it does not already exist,
    // over the destination table
    // isUpdateViewAlwaysEnabled: If false 'wrapperViewName' is only updated when schema evolves; if true
    // 'wrapperViewName' is always updated (every time publish happens)
    Optional<String> wrapperViewName = Optional.<String>absent();
    if (optionalViewRegistrationWhiteBlacklist.isPresent()) {
        wrapperViewName = optionalViewRegistrationWhiteBlacklist.get().acceptTable(orcTableDatabase, orcTableName)
                ? getConversionConfig().getDestinationViewName()
                : wrapperViewName;
    } else {
        wrapperViewName = getConversionConfig().getDestinationViewName();
    }
    boolean shouldUpdateView = getConversionConfig().isUpdateViewAlwaysEnabled();

    // Other properties
    Optional<List<String>> clusterBy = getConversionConfig().getClusterBy().isEmpty()
            ? Optional.<List<String>>absent()
            : Optional.of(getConversionConfig().getClusterBy());
    Optional<Integer> numBuckets = getConversionConfig().getNumBuckets();
    Optional<Integer> rowLimit = getConversionConfig().getRowLimit();
    Properties tableProperties = getConversionConfig().getDestinationTableProperties();

    // Partition dir hint helps create different directories for hourly and daily partitions with the same timestamp,
    // such as daily_2016-01-01-00 and hourly_2016-01-01-00.
    // This prevents existing hourly data from being deleted at the time of roll up, so Hive queries in flight
    // do not fail
    List<String> sourceDataPathIdentifier = getConversionConfig().getSourceDataPathIdentifier();

    // Populate optional partition info
    Map<String, String> partitionsDDLInfo = Maps.newHashMap();
    Map<String, String> partitionsDMLInfo = Maps.newHashMap();
    HiveConverterUtils.populatePartitionInfo(conversionEntity, partitionsDDLInfo, partitionsDMLInfo);

    /*
     * Create ORC data location with the same permissions as Avro data
     *
     * Note that hive can also automatically create the non-existing directories but it does not
     * seem to create it with the desired permissions.
     * According to hive docs permissions for newly created directories/files can be controlled using uMask like,
     *
     * SET hive.warehouse.subdir.inherit.perms=false;
     * SET fs.permissions.umask-mode=022;
     * Upon testing, this did not work
     */
    try {
        FileStatus sourceDataFileStatus = this.fs.getFileStatus(conversionEntity.getTable().getDataLocation());
        FsPermission sourceDataPermission = sourceDataFileStatus.getPermission();
        if (!this.fs.mkdirs(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission)) {
            throw new RuntimeException(String.format("Failed to create path %s with permissions %s",
                    new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission));
        } else {
            this.fs.setPermission(new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission);
            // Explicitly set group name for destination location if specified, otherwise preserve source group name
            String destinationGroupName;
            if (workUnit.contains(HIVE_DATASET_DESTINATION_GROUP_NAME)) {
                destinationGroupName = workUnit.getProp(HIVE_DATASET_DESTINATION_GROUP_NAME);
            } else {
                destinationGroupName = sourceDataFileStatus.getGroup();
            }
            if (!workUnit.getPropAsBoolean(HIVE_DATASET_DESTINATION_SKIP_SETGROUP,
                    DEFAULT_HIVE_DATASET_DESTINATION_SKIP_SETGROUP)) {
                this.fs.setOwner(new Path(getConversionConfig().getDestinationDataPath()), null, destinationGroupName);
            }
            log.info(String.format("Created %s with permissions %s and group %s",
                    new Path(getConversionConfig().getDestinationDataPath()), sourceDataPermission,
                    sourceDataFileStatus.getGroup()));

            // Explicitly set group name for staging directory if specified
            if (workUnit.contains(HIVE_DATASET_STAGING_GROUP_NAME)) {
                String stagingGroupName = workUnit.getProp(HIVE_DATASET_STAGING_GROUP_NAME);
                log.info("Setting staging directory group name as " + stagingGroupName);
                this.fs.mkdirs(new Path(getOrcStagingDataLocation(orcStagingTableName)));
                this.fs.setOwner(new Path(getOrcStagingDataLocation(orcStagingTableName)), null, stagingGroupName);

                // Staging directory will be renamed to getOrcDataLocation() and hence its group name should match
                // with the group name of the staging directory
                this.fs.mkdirs(new Path(getOrcDataLocation()));
                this.fs.setOwner(new Path(getOrcDataLocation()), null, stagingGroupName);
            }
        }
    } catch (IOException e) {
        Throwables.propagate(e);
    }

    // Set hive runtime properties from conversion config
    for (Map.Entry<Object, Object> entry : getConversionConfig().getHiveRuntimeProperties().entrySet()) {
        conversionEntity.getQueries().add(String.format("SET %s=%s", entry.getKey(), entry.getValue()));
    }
    // Set hive runtime properties for tracking
    conversionEntity.getQueries().add(
            String.format("SET %s=%s", GOBBLIN_DATASET_URN_KEY, conversionEntity.getTable().getCompleteName()));
    if (conversionEntity.getPartition().isPresent()) {
        conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_PARTITION_NAME_KEY,
                conversionEntity.getPartition().get().getCompleteName()));
    }
    conversionEntity.getQueries().add(String.format("SET %s=%s", GOBBLIN_WORKUNIT_CREATE_TIME_KEY,
            workUnit.getWorkunit().getProp(SlaEventKeys.ORIGIN_TS_IN_MILLI_SECS_KEY)));

    // Create DDL statement for table
    Map<String, String> hiveColumns = new LinkedHashMap<>();
    String createStagingTableDDL = HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema,
            orcStagingTableName, orcStagingDataLocation, Optional.of(orcTableDatabase), Optional.of(partitionsDDLInfo),
            clusterBy, Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(), numBuckets,
            Optional.<String>absent(), Optional.<String>absent(), Optional.<String>absent(), tableProperties,
            isEvolutionEnabled, destinationTableMeta, hiveColumns);
    conversionEntity.getQueries().add(createStagingTableDDL);
    log.debug("Create staging table DDL: " + createStagingTableDDL);

    // Create DDL statement for partition
    String orcStagingDataPartitionDirName = HiveConverterUtils.getStagingDataPartitionDirName(conversionEntity,
            sourceDataPathIdentifier);
    String orcStagingDataPartitionLocation = orcStagingDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
    if (partitionsDMLInfo.size() > 0) {
        List<String> createStagingPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(
                orcTableDatabase, orcStagingTableName, orcStagingDataPartitionLocation, partitionsDMLInfo);
        conversionEntity.getQueries().addAll(createStagingPartitionDDL);
        log.debug("Create staging partition DDL: " + createStagingPartitionDDL);
    }

    // Create DML statement
    String insertInORCStagingTableDML = HiveAvroORCQueryGenerator.generateTableMappingDML(
            conversionEntity.getHiveTable().getAvroSchema(), outputAvroSchema, avroTableName, orcStagingTableName,
            Optional.of(conversionEntity.getTable().getDbName()), Optional.of(orcTableDatabase),
            Optional.of(partitionsDMLInfo), Optional.<Boolean>absent(), Optional.<Boolean>absent(),
            isEvolutionEnabled, destinationTableMeta, rowLimit);
    conversionEntity.getQueries().add(insertInORCStagingTableDML);
    log.debug("Conversion staging DML: " + insertInORCStagingTableDML);

    // TODO: Split this method into two (conversion and publish)
    // Addition to WUS for Staging publish:
    // A. Evolution turned on:
    //    1. If table does not exist: simply create it (now it should exist)
    //    2. If table exists:
    //       2.1 Evolve table (alter table)
    //       2.2 If snapshot table:
    //           2.2.1 Delete data in final table directory
    //           2.2.2 Move data from staging to final table directory
    //           2.2.3 Drop this staging table and delete directories
    //       2.3 If partitioned table, move partitions from staging to final table; for all partitions:
    //           2.3.1 Drop if exists partition in final table
    //           2.3.2 Move partition directory
    //           2.3.3 Create partition with location
    //           2.3.4 Drop this staging table and delete directories
    // B. Evolution turned off:
    //    1. If table does not exist: simply create it (now it should exist)
    //    2. If table exists:
    //       2.1 Do not evolve table
    //       2.2 If snapshot table:
    //           2.2.1 Delete data in final table directory
    //           2.2.2 Move data from staging to final table directory
    //           2.2.3 Drop this staging table and delete directories
    //       2.3 If partitioned table, move partitions from staging to final table; for all partitions:
    //           2.3.1 Drop if exists partition in final table
    //           2.3.2 Move partition directory
    //           2.3.3 Create partition with location
    //           2.3.4 Drop this staging table and delete directories
    // Note: The queries below also serve as a compatibility check module before conversion; an incompatible
    // schema throws a Runtime exception, hence preventing further execution
    QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
    List<String> publishQueries = publishEntity.getPublishQueries();
    Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
    List<String> cleanupQueries = publishEntity.getCleanupQueries();
    List<String> cleanupDirectories = publishEntity.getCleanupDirectories();

    // Step:
    // A.1, B.1: If table does not exist, simply create it
    if (!destinationTableMeta.isPresent()) {
        String createTargetTableDDL = HiveAvroORCQueryGenerator.generateCreateTableDDL(outputAvroSchema,
                orcTableName, orcDataLocation, Optional.of(orcTableDatabase), Optional.of(partitionsDDLInfo),
                clusterBy, Optional.<Map<String, HiveAvroORCQueryGenerator.COLUMN_SORT_ORDER>>absent(), numBuckets,
                Optional.<String>absent(), Optional.<String>absent(), Optional.<String>absent(), tableProperties,
                isEvolutionEnabled, destinationTableMeta, new HashMap<String, String>());
        publishQueries.add(createTargetTableDDL);
        log.debug("Create final table DDL: " + createTargetTableDDL);
    }

    // Step:
    // A.2.1: If table pre-exists (destinationTableMeta would be present), evolve table
    // B.2.1: No-op
    List<String> evolutionDDLs = HiveAvroORCQueryGenerator.generateEvolutionDDL(orcStagingTableName, orcTableName,
            Optional.of(orcTableDatabase), Optional.of(orcTableDatabase), outputAvroSchema, isEvolutionEnabled,
            hiveColumns, destinationTableMeta);
    log.debug("Evolve final table DDLs: " + evolutionDDLs);
    EventWorkunitUtils.setEvolutionMetadata(workUnit, evolutionDDLs);

    // View (if present) must be updated if evolution happens
    shouldUpdateView |= evolutionDDLs.size() > 0;

    publishQueries.addAll(evolutionDDLs);

    if (partitionsDDLInfo.size() == 0) {
        // Step:
        // A.2.2, B.2.2: Snapshot table

        // Step:
        // A.2.2.1, B.2.2.1: Delete data in final table directory
        // A.2.2.2, B.2.2.2: Move data from staging to final table directory
        log.info("Snapshot directory to move: " + orcStagingDataLocation + " to: " + orcDataLocation);
        publishDirectories.put(orcStagingDataLocation, orcDataLocation);

        // Step:
        // A.2.2.3, B.2.2.3: Drop this staging table and delete directories
        String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase,
                orcStagingTableName);
        log.debug("Drop staging table DDL: " + dropStagingTableDDL);
        cleanupQueries.add(dropStagingTableDDL);

        // Delete: orcStagingDataLocation
        log.info("Staging table directory to delete: " + orcStagingDataLocation);
        cleanupDirectories.add(orcStagingDataLocation);
    } else {
        // Step:
        // A.2.3, B.2.3: If partitioned table, move partitions from staging to final table; for all partitions:

        // Step:
        // A.2.3.2, B.2.3.2: Move partition directory
        // Move: orcStagingDataPartitionLocation to: orcFinalDataPartitionLocation
        String orcFinalDataPartitionLocation = orcDataLocation + Path.SEPARATOR + orcStagingDataPartitionDirName;
        Optional<Path> destPartitionLocation = getDestinationPartitionLocation(destinationTableMeta, workUnit,
                conversionEntity.getPartition().get().getName());
        orcFinalDataPartitionLocation = HiveConverterUtils.updatePartitionLocation(orcFinalDataPartitionLocation,
                workUnit, destPartitionLocation);
        log.info("Partition directory to move: " + orcStagingDataPartitionLocation + " to: "
                + orcFinalDataPartitionLocation);
        publishDirectories.put(orcStagingDataPartitionLocation, orcFinalDataPartitionLocation);

        // Step:
        // A.2.3.1, B.2.3.1: Drop if exists partition in final table

        // Step:
        // If destination partition already exists, alter the partition location
        // A.2.3.3, B.2.3.3: Create partition with location (and update storage format if not in ORC already)
        List<String> dropPartitionsDDL = HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase,
                orcTableName, partitionsDMLInfo);
        log.debug("Drop partitions if exist in final table: " + dropPartitionsDDL);
        publishQueries.addAll(dropPartitionsDDL);

        if (workUnit.getPropAsBoolean(HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY,
                DEFAULT_HIVE_CONVERSION_SETSERDETOAVROEXPLICITELY)) {
            List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(
                    orcTableDatabase, orcTableName, orcFinalDataPartitionLocation, partitionsDMLInfo,
                    Optional.<String>absent());
            log.debug("Create final partition DDL: " + createFinalPartitionDDL);
            publishQueries.addAll(createFinalPartitionDDL);

            // Updating storage format non-transactionally is a stop-gap measure until Hive supports transactionally
            // updating the storage format in the ADD PARTITION command (today it only supports specifying location)
            List<String> updatePartitionStorageFormatDDL = HiveAvroORCQueryGenerator
                    .generateAlterTableOrPartitionStorageFormatDDL(orcTableDatabase, orcTableName,
                            Optional.of(partitionsDMLInfo), ORC_FORMAT);
            log.debug("Update final partition storage format to ORC (if not already in ORC)");
            publishQueries.addAll(updatePartitionStorageFormatDDL);
        } else {
            List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(
                    orcTableDatabase, orcTableName, orcFinalDataPartitionLocation, partitionsDMLInfo,
                    Optional.fromNullable(ORC_FORMAT));
            log.debug("Create final partition DDL: " + createFinalPartitionDDL);
            publishQueries.addAll(createFinalPartitionDDL);
        }

        // Step:
        // A.2.3.4, B.2.3.4: Drop this staging table and delete directories
        String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(orcTableDatabase,
                orcStagingTableName);
        log.debug("Drop staging table DDL: " + dropStagingTableDDL);
        cleanupQueries.add(dropStagingTableDDL);

        // Delete: orcStagingDataLocation
        log.info("Staging table directory to delete: " + orcStagingDataLocation);
        cleanupDirectories.add(orcStagingDataLocation);
    }

    /*
     * Drop the replaced partitions if any. This is required in case the partition being converted is derived from
     * several other partitions. E.g. a daily partition is a replacement of hourly partitions of the same day. When
     * the daily partition is converted to ORC, all its hourly ORC partitions need to be dropped.
     */
    publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(orcTableDatabase, orcTableName,
            getDropPartitionsDDLInfo(conversionEntity)));

    /*
     * Create or update view over the ORC table if specified in the config (i.e. wrapper view name is present in config)
     */
    if (wrapperViewName.isPresent()) {
        String viewName = wrapperViewName.get();
        List<String> createOrUpdateViewDDLs = HiveAvroORCQueryGenerator.generateCreateOrUpdateViewDDL(
                orcTableDatabase, orcTableName, orcTableDatabase, viewName, shouldUpdateView);
        log.debug("Create or update View DDLs: " + createOrUpdateViewDDLs);
        publishQueries.addAll(createOrUpdateViewDDLs);
    }

    HiveAvroORCQueryGenerator.serializePublishCommands(workUnit, publishEntity);
    log.debug("Publish partition entity: " + publishEntity);

    log.debug("Conversion Query " + conversionEntity.getQueries());

    EventWorkunitUtils.setEndDDLBuildTimeMetadata(workUnit, System.currentTimeMillis());

    return new SingleRecordIterable<>(conversionEntity);
}
From source file:org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset.java
License:Apache License
private List<DatasetDescriptor> createDestDatasets() {
    List<DatasetDescriptor> destDatasets = new ArrayList<>();
    for (String format : getDestFormats()) {
        Optional<ConversionConfig> conversionConfigForFormat = getConversionConfigForFormat(format);
        if (!conversionConfigForFormat.isPresent()) {
            continue;
        }
        String destTable = conversionConfigForFormat.get().getDestinationDbName() + "."
                + conversionConfigForFormat.get().getDestinationTableName();
        DatasetDescriptor dest = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, destTable);
        String destLocation = conversionConfigForFormat.get().getDestinationDataPath() + Path.SEPARATOR + "final";
        dest.addMetadata(DatasetConstants.FS_SCHEME,
                getSourceDataset().getMetadata().get(DatasetConstants.FS_SCHEME));
        dest.addMetadata(DatasetConstants.FS_LOCATION, destLocation);
        destDatasets.add(dest);
    }
    return destDatasets;
}
From source file:org.apache.gobblin.data.management.conversion.hive.materializer.HiveMaterializerFromEntityQueryGenerator.java
License:Apache License
public HiveMaterializerFromEntityQueryGenerator(WorkUnitState workUnitState, boolean supportTargetPartitioning)
        throws IOException {
    super(workUnitState);

    try {
        this.conversionEntity = getConversionEntity(this.workUnit);
    } catch (TException | HiveException ex) {
        throw new IOException(ex);
    }

    this.sourceTable = this.conversionEntity.getTable();
    this.inputDbName = this.sourceTable.getDbName();
    this.inputTableName = this.sourceTable.getTableName();
    this.sourceDataPathIdentifier = this.outputTableMetadata.getSourceDataPathIdentifier();
    this.stagingDataPartitionDirName = HiveConverterUtils.getStagingDataPartitionDirName(conversionEntity,
            sourceDataPathIdentifier);
    this.stagingDataPartitionLocation = stagingDataLocation + Path.SEPARATOR + stagingDataPartitionDirName;
    this.partitionsDDLInfo = Maps.newHashMap();
    this.partitionsDMLInfo = Maps.newHashMap();
    HiveConverterUtils.populatePartitionInfo(conversionEntity, partitionsDDLInfo, partitionsDMLInfo);
    this.supportTargetPartitioning = supportTargetPartitioning;
}
From source file:org.apache.gobblin.data.management.conversion.hive.materializer.HiveMaterializerFromEntityQueryGenerator.java
License:Apache License
/**
 * Returns a QueryBasedHivePublishEntity which includes publish level queries and cleanup commands.
 * @return QueryBasedHivePublishEntity
 * @throws DataConversionException
 */
public QueryBasedHivePublishEntity generatePublishQueries() throws DataConversionException {
    QueryBasedHivePublishEntity publishEntity = new QueryBasedHivePublishEntity();
    List<String> publishQueries = publishEntity.getPublishQueries();
    Map<String, String> publishDirectories = publishEntity.getPublishDirectories();
    List<String> cleanupQueries = publishEntity.getCleanupQueries();
    List<String> cleanupDirectories = publishEntity.getCleanupDirectories();

    String createFinalTableDDL = HiveConverterUtils.generateCreateDuplicateTableDDL(outputDatabaseName,
            stagingTableName, outputTableName, outputDataLocation, Optional.of(outputDatabaseName));
    publishQueries.add(createFinalTableDDL);
    log.debug("Create final table DDL:\n" + createFinalTableDDL);

    if (!this.supportTargetPartitioning || partitionsDDLInfo.size() == 0) {
        log.debug("Snapshot directory to move: " + stagingDataLocation + " to: " + outputDataLocation);
        publishDirectories.put(stagingDataLocation, outputDataLocation);

        String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(outputDatabaseName,
                stagingTableName);
        log.debug("Drop staging table DDL: " + dropStagingTableDDL);
        cleanupQueries.add(dropStagingTableDDL);

        log.debug("Staging table directory to delete: " + stagingDataLocation);
        cleanupDirectories.add(stagingDataLocation);
    } else {
        String finalDataPartitionLocation = outputDataLocation + Path.SEPARATOR + stagingDataPartitionDirName;
        Optional<Path> destPartitionLocation = HiveConverterUtils.getDestinationPartitionLocation(
                destinationTableMeta, this.workUnitState, conversionEntity.getPartition().get().getName());
        finalDataPartitionLocation = HiveConverterUtils.updatePartitionLocation(finalDataPartitionLocation,
                this.workUnitState, destPartitionLocation);

        log.debug("Partition directory to move: " + stagingDataPartitionLocation + " to: "
                + finalDataPartitionLocation);
        publishDirectories.put(stagingDataPartitionLocation, finalDataPartitionLocation);

        List<String> dropPartitionsDDL = HiveAvroORCQueryGenerator.generateDropPartitionsDDL(outputDatabaseName,
                outputTableName, partitionsDMLInfo);
        log.debug("Drop partitions if exist in final table: " + dropPartitionsDDL);
        publishQueries.addAll(dropPartitionsDDL);

        List<String> createFinalPartitionDDL = HiveAvroORCQueryGenerator.generateCreatePartitionDDL(
                outputDatabaseName, outputTableName, finalDataPartitionLocation, partitionsDMLInfo,
                Optional.<String>absent());
        log.debug("Create final partition DDL: " + createFinalPartitionDDL);
        publishQueries.addAll(createFinalPartitionDDL);

        String dropStagingTableDDL = HiveAvroORCQueryGenerator.generateDropTableDDL(outputDatabaseName,
                stagingTableName);
        log.debug("Drop staging table DDL: " + dropStagingTableDDL);
        cleanupQueries.add(dropStagingTableDDL);

        log.debug("Staging table directory to delete: " + stagingDataLocation);
        cleanupDirectories.add(stagingDataLocation);

        publishQueries.addAll(HiveAvroORCQueryGenerator.generateDropPartitionsDDL(outputDatabaseName,
                outputTableName, AbstractAvroToOrcConverter.getDropPartitionsDDLInfo(conversionEntity)));
    }

    log.info("Publish partition entity: " + publishEntity);
    return publishEntity;
}