List of usage examples for org.apache.hadoop.fs.FileSystem#getUri()
public abstract URI getUri();
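getUri() returns the URI whose scheme and authority identify a FileSystem instance (for example "file:///" or "hdfs://namenode:8020"), which is why the snippets below use it mostly for logging and for recording which cluster a path belongs to. Before the real-world examples, here is a minimal sketch of the basic call, assuming only a default Hadoop Configuration is available on the classpath; the class name GetUriExample is illustrative, not part of any of the projects quoted below.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class GetUriExample {
  public static void main(String[] args) throws Exception {
    // Resolve the default FileSystem from the Hadoop configuration (fs.defaultFS);
    // depending on the configuration this may be the local or an HDFS filesystem.
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // getUri() returns the scheme and authority identifying this FileSystem,
    // e.g. "file:///" or "hdfs://namenode:8020".
    URI uri = fs.getUri();
    System.out.println("Default filesystem URI: " + uri);

    // The same URI can be passed back to FileSystem.get to obtain an equivalent instance.
    FileSystem same = FileSystem.get(uri, conf);
    System.out.println("Scheme: " + same.getUri().getScheme());
  }
}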
From source file:org.apache.giraph.debugger.utils.AsyncHDFSWriteService.java
License:Apache License
/**
 * Writes given protobuf message to the given filesystem path in the
 * background.
 *
 * @param message
 *          The proto message to write.
 * @param fs
 *          The HDFS filesystem to write to.
 * @param fileName
 *          The HDFS path to write the message to.
 */
public static void writeToHDFS(final GeneratedMessage message, final FileSystem fs, final String fileName) {
  HDFS_ASYNC_WRITE_SERVICE.submit(new Runnable() {
    @Override
    public void run() {
      Path pt = new Path(fileName);
      try {
        LOG.info("Writing " + fileName + " at " + fs.getUri());
        OutputStream wrappedStream = fs.create(pt, true).getWrappedStream();
        message.writeTo(wrappedStream);
        wrappedStream.close();
        LOG.info("Done writing " + fileName);
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  });
}
From source file:org.apache.giraph.utils.DistributedCacheUtils.java
License:Apache License
/**
 * Copy a file to HDFS if it is local. If the path is already in HDFS, this
 * call does nothing.
 *
 * @param path path to file
 * @param conf Configuration
 * @return path to file on HDFS.
 */
public static Path copyToHdfs(Path path, Configuration conf) {
  if (path.toString().startsWith("hdfs://")) {
    // Already on HDFS
    return path;
  }
  FileSystem fs = null;
  try {
    fs = FileSystem.get(conf);
  } catch (IOException e) {
    throw new IllegalArgumentException("Failed to get HDFS FileSystem", e);
  }
  String name = getBaseName(path.toString()) + "-" + System.nanoTime();
  Path remotePath = new Path("/tmp/giraph", name);
  LOG.info("copyToHdfsIfNecessary: Copying " + path + " to " + remotePath + " on hdfs " + fs.getUri());
  try {
    fs.copyFromLocalFile(false, true, path, remotePath);
  } catch (IOException e) {
    throw new IllegalArgumentException("Failed to copy jython script from local path " + path
        + " to hdfs path " + remotePath + " on hdfs " + fs.getUri(), e);
  }
  return remotePath;
}
From source file:org.apache.giraph.yarn.GiraphYarnClient.java
License:Apache License
/**
 * Set delegation tokens for AM container
 * @param amContainer AM container
 */
private void setToken(ContainerLaunchContext amContainer) throws IOException {
  // Setup security tokens
  if (UserGroupInformation.isSecurityEnabled()) {
    Credentials credentials = new Credentials();
    String tokenRenewer = giraphConf.get(YarnConfiguration.RM_PRINCIPAL);
    if (tokenRenewer == null || tokenRenewer.length() == 0) {
      throw new IOException("Can't get Master Kerberos principal for the RM to use as renewer");
    }
    FileSystem fs = FileSystem.get(giraphConf);
    // For now, only getting tokens for the default file-system.
    final Token<?>[] tokens = fs.addDelegationTokens(tokenRenewer, credentials);
    if (tokens != null) {
      for (Token<?> token : tokens) {
        LOG.info("Got dt for " + fs.getUri() + "; " + token);
      }
    }
    DataOutputBuffer dob = new DataOutputBuffer();
    credentials.writeTokenStorageToStream(dob);
    ByteBuffer fsTokens = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    amContainer.setTokens(fsTokens);
  }
}
From source file:org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDatasetTest.java
License:Apache License
public static ConvertibleHiveDataset createTestConvertibleDataset(Config config) throws URISyntaxException {
  Table table = getTestTable("db1", "tb1");
  FileSystem mockFs = Mockito.mock(FileSystem.class);
  when(mockFs.getUri()).thenReturn(new URI("test"));
  ConvertibleHiveDataset cd = new ConvertibleHiveDataset(mockFs, Mockito.mock(HiveMetastoreClientPool.class),
      new org.apache.hadoop.hive.ql.metadata.Table(table), new Properties(), config);
  return cd;
}
From source file:org.apache.gobblin.data.management.copy.CopyableFile.java
License:Apache License
/**
 * Set file system based source and destination dataset for this {@link CopyableFile}
 *
 * @param originFs {@link FileSystem} where this {@link CopyableFile} origins
 * @param targetFs {@link FileSystem} where this {@link CopyableFile} is copied to
 */
public void setFsDatasets(FileSystem originFs, FileSystem targetFs) {
  /*
   * By default, the raw Gobblin dataset for CopyableFile lineage is its parent folder
   * if itself is not a folder
   */
  boolean isDir = origin.isDirectory();

  Path fullSourcePath = Path.getPathWithoutSchemeAndAuthority(origin.getPath());
  String sourceDatasetName = isDir ? fullSourcePath.toString() : fullSourcePath.getParent().toString();
  DatasetDescriptor sourceDataset = new DatasetDescriptor(originFs.getScheme(), sourceDatasetName);
  sourceDataset.addMetadata(DatasetConstants.FS_URI, originFs.getUri().toString());
  sourceData = sourceDataset;

  Path fullDestinationPath = Path.getPathWithoutSchemeAndAuthority(destination);
  String destinationDatasetName = isDir ? fullDestinationPath.toString()
      : fullDestinationPath.getParent().toString();
  DatasetDescriptor destinationDataset = new DatasetDescriptor(targetFs.getScheme(), destinationDatasetName);
  destinationDataset.addMetadata(DatasetConstants.FS_URI, targetFs.getUri().toString());
  destinationData = destinationDataset;
}
From source file:org.apache.gobblin.data.management.copy.CopyableFileTest.java
License:Apache License
@Test
public void testSetFsDatasets() throws URISyntaxException {
  FileSystem originFs = mock(FileSystem.class);
  String originFsUri = "hdfs://source.company.biz:2000";
  String originPath = "/data/databases/source/profile";
  when(originFs.getUri()).thenReturn(new URI(originFsUri));
  when(originFs.getScheme()).thenReturn("hdfs");

  FileSystem targetFs = mock(FileSystem.class);
  String targetFsUri = "file:///";
  String destinationPath = "/data/databases/destination/profile";
  when(targetFs.getUri()).thenReturn(new URI(targetFsUri));
  when(targetFs.getScheme()).thenReturn("file");

  // Test when source file is not a directory
  FileStatus origin = new FileStatus(0l, false, 0, 0l, 0l, new Path(originPath));
  CopyableFile copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null,
      PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "", null);
  copyableFile.setFsDatasets(originFs, targetFs);

  DatasetDescriptor source = (DatasetDescriptor) copyableFile.getSourceData();
  Assert.assertEquals(source.getName(), "/data/databases/source");
  Assert.assertEquals(source.getPlatform(), "hdfs");
  Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri);

  DatasetDescriptor destination = (DatasetDescriptor) copyableFile.getDestinationData();
  Assert.assertEquals(destination.getName(), "/data/databases/destination");
  Assert.assertEquals(destination.getPlatform(), "file");
  Assert.assertEquals(destination.getMetadata().get("fsUri"), targetFsUri);

  // Test when source file is a directory
  originPath = originFsUri + originPath;
  destinationPath = targetFsUri + destinationPath;
  origin = new FileStatus(0l, true, 0, 0l, 0l, new Path(originPath));
  copyableFile = new CopyableFile(origin, new Path(destinationPath), null, null, null,
      PreserveAttributes.fromMnemonicString(""), "", 0, 0, Maps.<String, String>newHashMap(), "", null);
  copyableFile.setFsDatasets(originFs, targetFs);

  source = (DatasetDescriptor) copyableFile.getSourceData();
  Assert.assertEquals(source.getName(), "/data/databases/source/profile");
  Assert.assertEquals(source.getPlatform(), "hdfs");
  Assert.assertEquals(source.getMetadata().get("fsUri"), originFsUri);

  destination = (DatasetDescriptor) copyableFile.getDestinationData();
  Assert.assertEquals(destination.getName(), "/data/databases/destination/profile");
  Assert.assertEquals(destination.getPlatform(), "file");
  Assert.assertEquals(destination.getMetadata().get("fsUri"), targetFsUri);
}
From source file:org.apache.gobblin.data.management.copy.CopySource.java
License:Apache License
/**
 * <ul>
 * Does the following:
 * <li>Instantiate a {@link DatasetsFinder}.
 * <li>Find all {@link Dataset} using {@link DatasetsFinder}.
 * <li>For each {@link CopyableDataset} get all {@link CopyEntity}s.
 * <li>Create a {@link WorkUnit} per {@link CopyEntity}.
 * </ul>
 *
 * <p>
 * In this implementation, one workunit is created for every {@link CopyEntity} found. But the extractor/converters
 * and writers are built to support multiple {@link CopyEntity}s per workunit
 * </p>
 *
 * @param state see {@link org.apache.gobblin.configuration.SourceState}
 * @return Work units for copying files.
 */
@Override
public List<WorkUnit> getWorkunits(final SourceState state) {
  this.metricContext = Instrumented.getMetricContext(state, CopySource.class);
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());

  try {
    DeprecationUtils.renameDeprecatedKeys(state,
        CopyConfiguration.MAX_COPY_PREFIX + "." + CopyResourcePool.ENTITIES_KEY,
        Lists.newArrayList(MAX_FILES_COPIED_KEY));

    final FileSystem sourceFs = HadoopUtils.getSourceFileSystem(state);
    final FileSystem targetFs = HadoopUtils.getWriterFileSystem(state, 1, 0);
    state.setProp(SlaEventKeys.SOURCE_URI, sourceFs.getUri());
    state.setProp(SlaEventKeys.DESTINATION_URI, targetFs.getUri());
    log.info("Identified source file system at {} and target file system at {}.", sourceFs.getUri(),
        targetFs.getUri());

    long maxSizePerBin = state.getPropAsLong(MAX_SIZE_MULTI_WORKUNITS, 0);
    long maxWorkUnitsPerMultiWorkUnit = state.getPropAsLong(MAX_WORK_UNITS_PER_BIN, 50);
    final long minWorkUnitWeight = Math.max(1, maxSizePerBin / maxWorkUnitsPerMultiWorkUnit);
    final Optional<CopyableFileWatermarkGenerator> watermarkGenerator =
        CopyableFileWatermarkHelper.getCopyableFileWatermarkGenerator(state);
    int maxThreads = state.getPropAsInt(MAX_CONCURRENT_LISTING_SERVICES, DEFAULT_MAX_CONCURRENT_LISTING_SERVICES);

    final CopyConfiguration copyConfiguration = CopyConfiguration.builder(targetFs, state.getProperties()).build();

    this.eventSubmitter = new EventSubmitter.Builder(this.metricContext, CopyConfiguration.COPY_PREFIX).build();

    DatasetsFinder<CopyableDatasetBase> datasetFinder = DatasetUtils.instantiateDatasetFinder(
        state.getProperties(), sourceFs, DEFAULT_DATASET_PROFILE_CLASS_KEY, this.eventSubmitter, state);

    IterableDatasetFinder<CopyableDatasetBase> iterableDatasetFinder =
        datasetFinder instanceof IterableDatasetFinder
            ? (IterableDatasetFinder<CopyableDatasetBase>) datasetFinder
            : new IterableDatasetFinderImpl<>(datasetFinder);

    Iterator<CopyableDatasetRequestor> requestorIteratorWithNulls = Iterators.transform(
        iterableDatasetFinder.getDatasetsIterator(),
        new CopyableDatasetRequestor.Factory(targetFs, copyConfiguration, log));
    Iterator<CopyableDatasetRequestor> requestorIterator =
        Iterators.filter(requestorIteratorWithNulls, Predicates.<CopyableDatasetRequestor>notNull());

    final SetMultimap<FileSet<CopyEntity>, WorkUnit> workUnitsMap =
        Multimaps.<FileSet<CopyEntity>, WorkUnit>synchronizedSetMultimap(
            HashMultimap.<FileSet<CopyEntity>, WorkUnit>create());

    RequestAllocator<FileSet<CopyEntity>> allocator = createRequestAllocator(copyConfiguration, maxThreads);
    Iterator<FileSet<CopyEntity>> prioritizedFileSets =
        allocator.allocateRequests(requestorIterator, copyConfiguration.getMaxToCopy());

    // Submit alertable events for unfulfilled requests
    submitUnfulfilledRequestEvents(allocator);

    String filesetWuGeneratorAlias = state.getProp(ConfigurationKeys.COPY_SOURCE_FILESET_WU_GENERATOR_CLASS,
        FileSetWorkUnitGenerator.class.getName());
    Iterator<Callable<Void>> callableIterator =
        Iterators.transform(prioritizedFileSets, new Function<FileSet<CopyEntity>, Callable<Void>>() {
          @Nullable
          @Override
          public Callable<Void> apply(FileSet<CopyEntity> input) {
            try {
              return GobblinConstructorUtils.<FileSetWorkUnitGenerator>invokeLongestConstructor(
                  new ClassAliasResolver(FileSetWorkUnitGenerator.class).resolveClass(filesetWuGeneratorAlias),
                  input.getDataset(), input, state, targetFs, workUnitsMap, watermarkGenerator,
                  minWorkUnitWeight, lineageInfo);
            } catch (Exception e) {
              throw new RuntimeException("Cannot create workunits generator", e);
            }
          }
        });

    try {
      List<Future<Void>> futures = new IteratorExecutor<>(callableIterator, maxThreads,
          ExecutorsUtils.newDaemonThreadFactory(Optional.of(log), Optional.of("Copy-file-listing-pool-%d")))
          .execute();

      for (Future<Void> future : futures) {
        try {
          future.get();
        } catch (ExecutionException exc) {
          log.error("Failed to get work units for dataset.", exc.getCause());
        }
      }
    } catch (InterruptedException ie) {
      log.error("Retrieval of work units was interrupted. Aborting.");
      return Lists.newArrayList();
    }

    log.info(String.format("Created %s workunits ", workUnitsMap.size()));

    copyConfiguration.getCopyContext().logCacheStatistics();

    if (state.contains(SIMULATE) && state.getPropAsBoolean(SIMULATE)) {
      log.info("Simulate mode enabled. Will not execute the copy.");
      for (Map.Entry<FileSet<CopyEntity>, Collection<WorkUnit>> entry : workUnitsMap.asMap().entrySet()) {
        log.info(String.format("Actions for dataset %s file set %s.",
            entry.getKey().getDataset().datasetURN(), entry.getKey().getName()));
        for (WorkUnit workUnit : entry.getValue()) {
          try {
            CopyEntity copyEntity = deserializeCopyEntity(workUnit);
            log.info(copyEntity.explain());
          } catch (Exception e) {
            log.info("Cannot deserialize CopyEntity from wu : {}", workUnit.toString());
          }
        }
      }
      return Lists.newArrayList();
    }

    List<? extends WorkUnit> workUnits = new WorstFitDecreasingBinPacking(maxSizePerBin)
        .pack(Lists.newArrayList(workUnitsMap.values()), this.weighter);
    log.info(String.format(
        "Bin packed work units. Initial work units: %d, packed work units: %d, max weight per bin: %d, "
            + "max work units per bin: %d.",
        workUnitsMap.size(), workUnits.size(), maxSizePerBin, maxWorkUnitsPerMultiWorkUnit));
    return ImmutableList.copyOf(workUnits);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
From source file:org.apache.gobblin.data.management.copy.replication.ConfigBasedDatasetTest.java
License:Apache License
public Collection<? extends CopyEntity> testGetCopyableFilesHelper(String sourceDir, String destinationDir,
    long sourceWatermark, boolean isFilterEnabled) throws Exception {
  FileSystem localFs = FileSystem.getLocal(new Configuration());
  URI local = localFs.getUri();

  Properties properties = new Properties();
  properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/publisher");
  PathFilter pathFilter = DatasetUtils.instantiatePathFilter(properties);
  boolean applyFilterToDirectories = false;
  if (isFilterEnabled) {
    properties.setProperty(DatasetUtils.CONFIGURATION_KEY_PREFIX + "path.filter.class",
        "org.apache.gobblin.util.filters.HiddenFilter");
    properties.setProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "true");

    pathFilter = DatasetUtils.instantiatePathFilter(properties);
    applyFilterToDirectories =
        Boolean.parseBoolean(properties.getProperty(CopyConfiguration.APPLY_FILTER_TO_DIRECTORIES, "false"));
  }

  CopyConfiguration copyConfiguration = CopyConfiguration
      .builder(FileSystem.getLocal(new Configuration()), properties)
      .publishDir(new Path(destinationDir))
      .preserve(PreserveAttributes.fromMnemonicString("ugp")).build();

  ReplicationMetaData mockMetaData = Mockito.mock(ReplicationMetaData.class);
  Mockito.when(mockMetaData.toString()).thenReturn("Mock Meta Data");

  ReplicationConfiguration mockRC = Mockito.mock(ReplicationConfiguration.class);
  Mockito.when(mockRC.getCopyMode()).thenReturn(ReplicationCopyMode.PULL);
  Mockito.when(mockRC.getMetaData()).thenReturn(mockMetaData);
  Mockito.when(mockRC.getVersionStrategyFromConfigStore())
      .thenReturn(Optional.of(DataFileVersionStrategy.DEFAULT_DATA_FILE_VERSION_STRATEGY));
  Mockito.when(mockRC.getEnforceFileSizeMatchFromConfigStore()).thenReturn(Optional.absent());

  HadoopFsEndPoint copyFrom = Mockito.mock(HadoopFsEndPoint.class);
  Mockito.when(copyFrom.getDatasetPath()).thenReturn(new Path(sourceDir));
  Mockito.when(copyFrom.getFsURI()).thenReturn(local);
  ComparableWatermark sw = new LongWatermark(sourceWatermark);
  Mockito.when(copyFrom.getWatermark()).thenReturn(Optional.of(sw));
  Mockito.when(copyFrom.getFiles()).thenReturn(
      FileListUtils.listFilesRecursively(localFs, new Path(sourceDir), pathFilter, applyFilterToDirectories));

  HadoopFsEndPoint copyTo = Mockito.mock(HadoopFsEndPoint.class);
  Mockito.when(copyTo.getDatasetPath()).thenReturn(new Path(destinationDir));
  Mockito.when(copyTo.getFsURI()).thenReturn(local);
  Optional<ComparableWatermark> tmp = Optional.absent();
  Mockito.when(copyTo.getWatermark()).thenReturn(tmp);
  Mockito.when(copyTo.getFiles()).thenReturn(FileListUtils.listFilesRecursively(localFs,
      new Path(destinationDir), pathFilter, applyFilterToDirectories));

  CopyRoute route = Mockito.mock(CopyRoute.class);
  Mockito.when(route.getCopyFrom()).thenReturn(copyFrom);
  Mockito.when(route.getCopyTo()).thenReturn(copyTo);

  ConfigBasedDataset dataset = new ConfigBasedDataset(mockRC, properties, route);

  Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(localFs, copyConfiguration);
  return copyableFiles;
}
From source file:org.apache.gobblin.data.management.copy.replication.ConfigBasedMultiDatasets.java
License:Apache License
public ConfigBasedMultiDatasets(Config c, Properties props, Optional<List<String>> blacklistPatterns) {
  this.props = props;
  blacklist = patternListInitHelper(blacklistPatterns);

  try {
    FileSystem executionCluster = FileSystem.get(new Configuration());
    URI executionClusterURI = executionCluster.getUri();

    ReplicationConfiguration rc = ReplicationConfiguration.buildFromConfig(c);

    // push mode
    if (this.props.containsKey(REPLICATION_PUSH_MODE)
        && Boolean.parseBoolean(this.props.getProperty(REPLICATION_PUSH_MODE))) {
      generateDatasetInPushMode(rc, executionClusterURI);
    }
    // default pull mode
    else {
      generateDatasetInPullMode(rc, executionClusterURI);
    }
  } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
    log.error("Can not create Replication Configuration from raw config "
        + c.root().render(ConfigRenderOptions.defaults().setComments(false).setOriginComments(false)), e);
  } catch (IOException ioe) {
    log.error("Can not decide current execution cluster ", ioe);
  }
}
From source file:org.apache.gobblin.data.management.copy.splitter.DistcpFileSplitter.java
License:Apache License
/**
 * @param state {@link State} containing properties for a job.
 * @param targetFs destination {@link FileSystem} where file is to be copied
 * @return whether to allow for splitting of work units based on the filesystem, state, converter/writer config.
 */
public static boolean allowSplit(State state, FileSystem targetFs) {
  // Don't allow distcp jobs that use decrypt/ungzip converters or tararchive/encrypt writers to split work units
  Collection<String> converterClassNames = Collections.emptyList();
  if (state.contains(ConfigurationKeys.CONVERTER_CLASSES_KEY)) {
    converterClassNames = state.getPropAsList(ConfigurationKeys.CONVERTER_CLASSES_KEY);
  }
  return state.getPropAsBoolean(SPLIT_ENABLED, false)
      && KNOWN_SCHEMES_SUPPORTING_CONCAT.contains(targetFs.getUri().getScheme())
      && state.getProp(ConfigurationKeys.WRITER_BUILDER_CLASS, "")
          .equals(FileAwareInputStreamDataWriterBuilder.class.getName())
      && converterClassNames.stream().noneMatch(s -> !s.equals(IdentityConverter.class.getName()));
}