List of usage examples for org.apache.hadoop.fs.Path.SEPARATOR
Declared as: String SEPARATOR
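For reference, Path.SEPARATOR is the path separator string used by Hadoop paths, which always use forward slashes regardless of the local OS. A minimal standalone sketch:

import org.apache.hadoop.fs.Path;

public class PathSeparatorDemo {
    public static void main(String[] args) {
        // Path.SEPARATOR is the String "/"; Path.SEPARATOR_CHAR is the char '/'.
        System.out.println(Path.SEPARATOR);      // prints: /
        // Typical use: joining path segments built as plain strings.
        String joined = "base" + Path.SEPARATOR + "child";
        System.out.println(new Path(joined));    // prints: base/child
    }
}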
From source file: org.apache.gobblin.data.management.conversion.hive.task.HiveConverterUtils.java
License: Apache License
/**
 * Get the final table location of format: <final table location>/final
 *
 * @return final table location.
 */
public static String getOutputDataLocation(String outputDataLocation) {
    return outputDataLocation + Path.SEPARATOR + PUBLISHED_TABLE_SUBDIRECTORY;
}
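A hypothetical call, assuming PUBLISHED_TABLE_SUBDIRECTORY is "final" as the Javadoc's format string suggests:

// Hypothetical usage (PUBLISHED_TABLE_SUBDIRECTORY assumed to be "final"):
String out = HiveConverterUtils.getOutputDataLocation("/data/hive/events");
// out == "/data/hive/events/final"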
From source file: org.apache.gobblin.data.management.conversion.hive.task.HiveConverterUtils.java
License: Apache License
/**
 * Get the staging table location of format: <final table location>/<staging table name>
 *
 * @param outputDataLocation output table data location.
 * @param stagingTableName name of the staging table.
 * @return staging table location.
 */
public static String getStagingDataLocation(String outputDataLocation, String stagingTableName) {
    return outputDataLocation + Path.SEPARATOR + stagingTableName;
}
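A hypothetical call with an illustrative staging table name:

// Hypothetical usage; the table name is illustrative only:
String staging = HiveConverterUtils.getStagingDataLocation("/data/hive/events", "events_staging_123");
// staging == "/data/hive/events/events_staging_123"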
From source file: org.apache.gobblin.data.management.conversion.hive.validation.ValidationJob.java
License: Apache License
/**
 * Execute Hive queries using {@link HiveJdbcConnector} and validate results.
 *
 * @param queries Queries to execute.
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "SQL_NONCONSTANT_STRING_PASSED_TO_EXECUTE",
    justification = "Temporary fix")
private List<Long> getValidationOutputFromHive(List<String> queries) throws IOException {

    if (null == queries || queries.size() == 0) {
        log.warn("No queries specified to be executed");
        return Collections.emptyList();
    }

    List<Long> rowCounts = Lists.newArrayList();
    Closer closer = Closer.create();

    try {
        HiveJdbcConnector hiveJdbcConnector = closer.register(HiveJdbcConnector.newConnectorWithProps(props));
        for (String query : queries) {
            String hiveOutput = "hiveConversionValidationOutput_" + UUID.randomUUID().toString();
            Path hiveTempDir = new Path("/tmp" + Path.SEPARATOR + hiveOutput);
            query = "INSERT OVERWRITE DIRECTORY '" + hiveTempDir + "' " + query;
            log.info("Executing query: " + query);

            try {
                if (this.hiveSettings.size() > 0) {
                    hiveJdbcConnector
                        .executeStatements(this.hiveSettings.toArray(new String[this.hiveSettings.size()]));
                }
                hiveJdbcConnector.executeStatements("SET hive.exec.compress.output=false",
                    "SET hive.auto.convert.join=false", query);

                FileStatus[] fileStatusList = this.fs.listStatus(hiveTempDir);
                List<FileStatus> files = new ArrayList<>();
                for (FileStatus fileStatus : fileStatusList) {
                    if (fileStatus.isFile()) {
                        files.add(fileStatus);
                    }
                }

                if (files.size() > 1) {
                    log.warn("Found more than one output file. Should have been one.");
                } else if (files.size() == 0) {
                    log.warn("Found no output file. Should have been one.");
                } else {
                    String theString = IOUtils.toString(
                        new InputStreamReader(this.fs.open(files.get(0).getPath()), Charsets.UTF_8));
                    log.info("Found row count: " + theString.trim());
                    if (StringUtils.isBlank(theString.trim())) {
                        rowCounts.add(0L);
                    } else {
                        try {
                            rowCounts.add(Long.parseLong(theString.trim()));
                        } catch (NumberFormatException e) {
                            throw new RuntimeException("Could not parse Hive output: " + theString.trim(), e);
                        }
                    }
                }
            } finally {
                if (this.fs.exists(hiveTempDir)) {
                    log.debug("Deleting temp dir: " + hiveTempDir);
                    this.fs.delete(hiveTempDir, true);
                }
            }
        }
    } catch (SQLException e) {
        log.warn("Execution failed for query set " + queries.toString(), e);
    } finally {
        try {
            closer.close();
        } catch (Exception e) {
            log.warn("Could not close HiveJdbcConnector", e);
        }
    }

    return rowCounts;
}
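To make the rewriting step concrete, here is a sketch of what a query looks like before and after it is wrapped; the table name is hypothetical and the UUID is elided:

// Hypothetical validation query and its rewritten form:
String query = "SELECT COUNT(*) FROM db.tbl WHERE ds = '2021-01-01'";
String rewritten = "INSERT OVERWRITE DIRECTORY '/tmp/hiveConversionValidationOutput_<uuid>' " + query;
// Hive writes the single COUNT(*) value into a file under that directory,
// which the method above then reads back and parses as a long.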
From source file: org.apache.gobblin.data.management.copy.hive.HiveDatasetFinder.java
License: Apache License
/**
 * Gets the {@link Config} for this <code>dbAndTable</code>.
 * Cases:
 * <ul>
 * <li>If {@link #configStoreUri} is available, the dataset config is read from the config store at this URI
 * <li>If {@link #configStoreUri} is not available, the job config is used as the dataset config
 * <li>If {@link #datasetConfigPrefix} is specified, only configs with this prefix are returned
 * <li>If {@link #datasetConfigPrefix} is not specified, all configs are returned
 * </ul>
 * @param table the table of the dataset whose config to get
 * @return the {@link Config} for <code>dbAndTable</code>
 */
private Config getDatasetConfig(Table table)
    throws ConfigStoreFactoryDoesNotExistsException, ConfigStoreCreationException, URISyntaxException {

    Config datasetConfig;
    Optional<Config> runtimeConfig = ConfigClientUtils.getOptionalRuntimeConfig(properties);

    // Config store enabled
    if (this.configStoreUri.isPresent()) {
        if (runtimeConfig.isPresent()) {
            datasetConfig = this.configClient.getConfig(
                this.configStoreUri.get() + Path.SEPARATOR + this.configStoreDatasetUriBuilder.apply(table),
                runtimeConfig);
        } else {
            datasetConfig = this.configClient.getConfig(
                this.configStoreUri.get() + Path.SEPARATOR + this.configStoreDatasetUriBuilder.apply(table));
        }
    // If config store is not enabled, use the job config
    } else {
        datasetConfig = this.jobConfig;
    }

    return StringUtils.isBlank(this.datasetConfigPrefix) ? datasetConfig
        : ConfigUtils.getConfig(datasetConfig, this.datasetConfigPrefix, ConfigFactory.empty());
}
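The prefix scoping at the end can be sketched with plain Typesafe Config (Gobblin's ConfigUtils.getConfig additionally applies the empty-config fallback, but the idea is the same); the keys here are hypothetical:

import com.typesafe.config.Config;
import com.typesafe.config.ConfigFactory;

// Hypothetical keys illustrating prefix scoping:
Config full = ConfigFactory.parseString("gobblin.dataset.copy.enabled = true\nother.key = 1");
Config scoped = full.getConfig("gobblin.dataset");
// scoped contains copy.enabled = true; other.key is filtered out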
From source file: org.apache.gobblin.data.management.copy.writer.TarArchiveInputStreamDataWriter.java
License: Apache License
/**
 * Untars the passed in {@link FileAwareInputStream} to the task's staging directory. Uses the name of the root
 * {@link TarArchiveEntry} in the stream as the directory name for the untarred file. The method also commits the
 * data by moving the file from the staging to the output directory.
 *
 * @see org.apache.gobblin.data.management.copy.writer.FileAwareInputStreamDataWriter#write(org.apache.gobblin.data.management.copy.FileAwareInputStream)
 */
@Override
public void writeImpl(InputStream inputStream, Path writeAt, CopyableFile copyableFile, FileAwareInputStream record)
    throws IOException {
    this.closer.register(inputStream);

    TarArchiveInputStream tarIn = new TarArchiveInputStream(inputStream);
    final ReadableByteChannel inputChannel = Channels.newChannel(tarIn);
    TarArchiveEntry tarEntry;

    // Skip the first entry in the tar, which is just the root directory
    tarEntry = tarIn.getNextTarEntry();
    String tarEntryRootName = StringUtils.remove(tarEntry.getName(), Path.SEPARATOR);

    log.info("Unarchiving at " + writeAt);

    try {
        while ((tarEntry = tarIn.getNextTarEntry()) != null) {
            // tarEntry.getName() is misleading: it is actually the path of the entry within the tar file
            String newTarEntryPath = tarEntry.getName().replace(tarEntryRootName, writeAt.getName());
            Path tarEntryStagingPath = new Path(writeAt.getParent(), newTarEntryPath);
            if (!FileUtils.isSubPath(writeAt.getParent(), tarEntryStagingPath)) {
                throw new IOException(String.format(
                    "Extracted file: %s is trying to write outside of output directory: %s",
                    tarEntryStagingPath, writeAt.getParent()));
            }

            if (tarEntry.isDirectory() && !this.fs.exists(tarEntryStagingPath)) {
                this.fs.mkdirs(tarEntryStagingPath);
            } else if (!tarEntry.isDirectory()) {
                FSDataOutputStream out = this.fs.create(tarEntryStagingPath, true);
                final WritableByteChannel outputChannel = Channels.newChannel(out);
                try {
                    StreamCopier copier = new StreamCopier(inputChannel, outputChannel);
                    if (isInstrumentationEnabled()) {
                        copier.withCopySpeedMeter(this.copySpeedMeter);
                    }
                    this.bytesWritten.addAndGet(copier.copy());
                    if (isInstrumentationEnabled()) {
                        log.info("File {}: copied {} bytes, average rate: {} B/s",
                            copyableFile.getOrigin().getPath(), this.copySpeedMeter.getCount(),
                            this.copySpeedMeter.getMeanRate());
                    } else {
                        log.info("File {} copied.", copyableFile.getOrigin().getPath());
                    }
                } finally {
                    out.close();
                    outputChannel.close();
                }
            }
        }
    } finally {
        tarIn.close();
        inputChannel.close();
        inputStream.close();
    }
}
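To see how the tar's root entry is remapped onto the write location, a small sketch with hypothetical paths:

import org.apache.hadoop.fs.Path;

public class TarEntryPathDemo {
    public static void main(String[] args) {
        // Hypothetical values mirroring the remapping above.
        Path writeAt = new Path("/staging/task1/archive");
        String tarEntryRootName = "archive-root";
        String entryName = "archive-root/sub/file.txt";

        // The tar's root directory name is replaced with writeAt's name,
        // so every entry is staged under writeAt.
        String newTarEntryPath = entryName.replace(tarEntryRootName, writeAt.getName());
        System.out.println(new Path(writeAt.getParent(), newTarEntryPath));
        // prints: /staging/task1/archive/sub/file.txt
    }
}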
From source file: org.apache.gobblin.source.DatePartitionedNestedRetriever.java
License: Apache License
private String constructPartitionPath(DateTime date) {
    StringBuilder pathBuilder = new StringBuilder();

    if (!this.sourcePartitionPrefix.isEmpty()) {
        pathBuilder.append(this.sourcePartitionPrefix);
        pathBuilder.append(Path.SEPARATOR);
    }

    pathBuilder.append(this.partitionPatternFormatter.print(date));

    if (!this.sourcePartitionSuffix.isEmpty()) {
        pathBuilder.append(Path.SEPARATOR);
        pathBuilder.append(this.sourcePartitionSuffix);
    }

    return pathBuilder.toString();
}
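With hypothetical settings (prefix "events", pattern "yyyy/MM/dd", suffix "late"), the method would produce a path like this:

import org.apache.hadoop.fs.Path;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;

// Hypothetical configuration values; mirrors constructPartitionPath above.
String partition = "events" + Path.SEPARATOR
    + DateTimeFormat.forPattern("yyyy/MM/dd").print(new DateTime(2021, 1, 15, 0, 0))
    + Path.SEPARATOR + "late";
// partition == "events/2021/01/15/late"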
From source file: org.apache.gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java
License: Apache License
@Test
public void testJobStateNotCopiedToWorkUnit() {
    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

    String dummyKey = "dummy.job.config";
    state.setProp(dummyKey, "dummy");

    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 4);
    for (WorkUnit wu : workunits) {
        if (wu instanceof MultiWorkUnit) {
            for (WorkUnit workUnit : ((MultiWorkUnit) wu).getWorkUnits()) {
                Assert.assertFalse(workUnit.contains(dummyKey));
            }
        } else {
            Assert.assertFalse(wu.contains(dummyKey));
        }
    }
}
From source file: org.apache.gobblin.source.extractor.DatePartitionedAvroFileExtractorTest.java
License: Apache License
@Test
public void testReadPartitionsByMinuteWithLeadtime() throws IOException, DataRecordException {
    DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
    state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
    state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
    state.setProp("date.partitioned.source.min.watermark.value",
        DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
    state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
    state.setProp("date.partitioned.source.partition.prefix", PREFIX);
    state.setProp("date.partitioned.source.partition.suffix", SUFFIX);
    state.setProp("date.partitioned.source.partition.lead_time.size", "3");
    state.setProp("date.partitioned.source.partition.lead_time.granularity", "HOUR");

    /*
     * Since the lead time is 3 hours, only the first WorkUnit (which is 6 hours old; the rest are
     * 2 hours old) should get picked up.
     */
    List<WorkUnit> workunits = source.getWorkunits(state);

    Assert.assertEquals(workunits.size(), 1);
    verifyWorkUnits(workunits, workunits.size());
}
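A sketch of the lead-time cutoff being tested; the variables here are illustrative, not the Source's actual field names:

import org.joda.time.DateTime;

// Illustrative lead-time arithmetic (3-hour lead time as configured above):
DateTime now = DateTime.now();
DateTime cutoff = now.minusHours(3);
boolean sixHoursOldEligible = now.minusHours(6).isBefore(cutoff); // true
boolean twoHoursOldEligible = now.minusHours(2).isBefore(cutoff); // false
// Hence only the one 6-hour-old partition becomes a WorkUnit.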
From source file: org.apache.gobblin.util.WriterUtils.java
License: Apache License
/**
 * Creates the {@link Path} for case {@link WriterFilePathType#NAMESPACE_TABLE} from the configurations
 * {@link ConfigurationKeys#EXTRACT_NAMESPACE_NAME_KEY} and {@link ConfigurationKeys#EXTRACT_TABLE_NAME_KEY}.
 *
 * @param state the {@link State} carrying the extract configuration
 * @return a path of the form <namespace with dots replaced by slashes>/<table name>
 */
public static Path getNamespaceTableWriterFilePath(State state) {
    Preconditions.checkArgument(state.contains(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY));
    Preconditions.checkArgument(state.contains(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));

    String namespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY).replaceAll("\\.", Path.SEPARATOR);
    return new Path(namespace + Path.SEPARATOR + state.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
}
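A hypothetical input/output pair for this transformation:

import org.apache.hadoop.fs.Path;

// Hypothetical namespace and table name:
String namespace = "com.example.tracking".replaceAll("\\.", Path.SEPARATOR);
Path writerFilePath = new Path(namespace + Path.SEPARATOR + "PageViewEvent");
// writerFilePath == com/example/tracking/PageViewEvent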
From source file: org.apache.gobblin.writer.AvroHdfsDataWriterTest.java
License: Apache License
@Test
public void testWrite() throws IOException {
    // Write all test records
    for (String record : TestConstants.JSON_RECORDS) {
        this.writer.write(convertRecord(record));
    }

    Assert.assertEquals(this.writer.recordsWritten(), 3);

    this.writer.close();
    this.writer.commit();

    File outputFile = new File(TestConstants.TEST_OUTPUT_DIR + Path.SEPARATOR + this.filePath,
        TestConstants.TEST_FILE_NAME);
    DataFileReader<GenericRecord> reader = new DataFileReader<>(outputFile,
        new GenericDatumReader<GenericRecord>(this.schema));

    // Read the records back and assert they are identical to the ones written
    GenericRecord user1 = reader.next();
    // Strings are in UTF8, so we have to call toString() here and below
    Assert.assertEquals(user1.get("name").toString(), "Alyssa");
    Assert.assertEquals(user1.get("favorite_number"), 256);
    Assert.assertEquals(user1.get("favorite_color").toString(), "yellow");

    GenericRecord user2 = reader.next();
    Assert.assertEquals(user2.get("name").toString(), "Ben");
    Assert.assertEquals(user2.get("favorite_number"), 7);
    Assert.assertEquals(user2.get("favorite_color").toString(), "red");

    GenericRecord user3 = reader.next();
    Assert.assertEquals(user3.get("name").toString(), "Charlie");
    Assert.assertEquals(user3.get("favorite_number"), 68);
    Assert.assertEquals(user3.get("favorite_color").toString(), "blue");

    reader.close();

    FsWriterMetrics metrics = FsWriterMetrics.fromJson(properties.getProp(FsDataWriter.FS_WRITER_METRICS_KEY));
    Assert.assertEquals(metrics.fileInfos.size(), 1);
    FsWriterMetrics.FileInfo fileInfo = metrics.fileInfos.iterator().next();

    Assert.assertEquals(fileInfo.fileName, TestConstants.TEST_FILE_NAME);
    Assert.assertEquals(fileInfo.numRecords, 3);
    Assert.assertNull(metrics.partitionInfo.partitionKey);
    Assert.assertEquals(metrics.partitionInfo.branchId, 0);
}
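For orientation, the output file the test opens is composed as follows; the constant values here are hypothetical, not the actual TestConstants:

import java.io.File;
import org.apache.hadoop.fs.Path;

// Hypothetical constants: TEST_OUTPUT_DIR = "test-output",
// filePath = "namespace/table", TEST_FILE_NAME = "part.task_test.avro".
File outputFile = new File("test-output" + Path.SEPARATOR + "namespace/table", "part.task_test.avro");
// -> test-output/namespace/table/part.task_test.avro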