List of usage examples for org.apache.hadoop.fs LocatedFileStatus getPath
public Path getPath()
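getPath() returns the fully qualified org.apache.hadoop.fs.Path of the listed entry. Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: list a directory with FileSystem.listFiles() and read each entry's path via LocatedFileStatus.getPath(). The class name, the fallback "/tmp/data" directory, and the printing are illustrative assumptions, not taken from any of the projects listed below.

// Minimal sketch (assumed setup): recursively list a directory and print each file's Path.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListPathsExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical input directory; replace with a real HDFS or local path.
    Path dir = new Path(args.length > 0 ? args[0] : "/tmp/data");
    FileSystem fs = dir.getFileSystem(new Configuration());
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true); // true = recursive
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      // getPath() returns the fully qualified Path of the listed file.
      System.out.println(status.getPath());
    }
  }
}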
From source file:io.druid.storage.hdfs.HdfsDataSegmentFinder.java
License:Apache License
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Set<DataSegment> segments = Sets.newHashSet();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        final Path indexZip;
        final String[] descriptorParts = path.getName().split("_");
        if (descriptorParts.length == 2
            && descriptorParts[1].equals("descriptor.json")
            && org.apache.commons.lang.StringUtils.isNumeric(descriptorParts[0])) {
          indexZip = new Path(path.getParent(), StringUtils.format("%s_index.zip", descriptorParts[0]));
        } else {
          indexZip = new Path(path.getParent(), "index.zip");
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME)
              || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          segments.add(dataSegment);
        } else {
          throw new SegmentLoadingException("index.zip didn't exist at [%s] while descriptor.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return segments;
}
From source file:io.druid.storage.hdfs.HdfsDataSegmentPuller.java
License:Apache License
public FileUtils.FileCopyResult getSegmentFiles(final Path path, final File outDir) throws SegmentLoadingException {
  final LocalFileSystem localFileSystem = new LocalFileSystem();
  try {
    final FileSystem fs = path.getFileSystem(config);
    if (fs.isDirectory(path)) {
      // -------- directory ---------
      try {
        return RetryUtils.retry(new Callable<FileUtils.FileCopyResult>() {
          @Override
          public FileUtils.FileCopyResult call() throws Exception {
            if (!fs.exists(path)) {
              throw new SegmentLoadingException("No files found at [%s]", path.toString());
            }
            final RemoteIterator<LocatedFileStatus> children = fs.listFiles(path, false);
            final ArrayList<FileUtils.FileCopyResult> localChildren = new ArrayList<>();
            final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
            while (children.hasNext()) {
              final LocatedFileStatus child = children.next();
              final Path childPath = child.getPath();
              final String fname = childPath.getName();
              if (fs.isDirectory(childPath)) {
                log.warn("[%s] is a child directory, skipping", childPath.toString());
              } else {
                final File outFile = new File(outDir, fname);
                // Actual copy
                fs.copyToLocalFile(childPath, new Path(outFile.toURI()));
                result.addFile(outFile);
              }
            }
            log.info("Copied %d bytes from [%s] to [%s]", result.size(), path.toString(), outDir.getAbsolutePath());
            return result;
          }
        }, shouldRetryPredicate(), DEFAULT_RETRY_COUNT);
      } catch (Exception e) {
        throw Throwables.propagate(e);
      }
    } else if (CompressionUtils.isZip(path.getName())) {
      // -------- zip ---------
      final FileUtils.FileCopyResult result = CompressionUtils.unzip(new ByteSource() {
        @Override
        public InputStream openStream() throws IOException {
          return getInputStream(path);
        }
      }, outDir, shouldRetryPredicate(), false);
      log.info("Unzipped %d bytes from [%s] to [%s]", result.size(), path.toString(), outDir.getAbsolutePath());
      return result;
    } else if (CompressionUtils.isGz(path.getName())) {
      // -------- gzip ---------
      final String fname = path.getName();
      final File outFile = new File(outDir, CompressionUtils.getGzBaseName(fname));
      final FileUtils.FileCopyResult result = CompressionUtils.gunzip(new ByteSource() {
        @Override
        public InputStream openStream() throws IOException {
          return getInputStream(path);
        }
      }, outFile);
      log.info("Gunzipped %d bytes from [%s] to [%s]", result.size(), path.toString(), outFile.getAbsolutePath());
      return result;
    } else {
      throw new SegmentLoadingException("Do not know how to handle file type at [%s]", path.toString());
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Error loading [%s]", path.toString());
  }
}
From source file:io.fluo.webindex.data.LoadHdfs.java
License:Apache License
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    log.error("Usage: LoadHdfs <dataDir>");
    System.exit(1);
  }
  final String dataDir = args[0];
  IndexEnv.validateDataDir(dataDir);
  final String hadoopConfDir = IndexEnv.getHadoopConfDir();
  final int rateLimit = DataConfig.load().getLoadRateLimit();
  List<String> loadPaths = new ArrayList<>();
  FileSystem hdfs = IndexEnv.getHDFS();
  RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(new Path(dataDir), true);
  while (listIter.hasNext()) {
    LocatedFileStatus status = listIter.next();
    if (status.isFile()) {
      loadPaths.add(status.getPath().toString());
    }
  }
  log.info("Loading {} files into Fluo from {}", loadPaths.size(), dataDir);
  SparkConf sparkConf = new SparkConf().setAppName("webindex-load-hdfs");
  try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
    JavaRDD<String> paths = ctx.parallelize(loadPaths, loadPaths.size());
    paths.foreachPartition(iter -> {
      final FluoConfiguration fluoConfig = new FluoConfiguration(new File("fluo.properties"));
      final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
      FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
      try (FluoClient client = FluoFactory.newClient(fluoConfig);
          LoaderExecutor le = client.newLoaderExecutor()) {
        iter.forEachRemaining(path -> {
          Path filePath = new Path(path);
          try {
            if (fs.exists(filePath)) {
              FSDataInputStream fsin = fs.open(filePath);
              ArchiveReader reader = WARCReaderFactory.get(filePath.getName(), fsin, true);
              for (ArchiveRecord record : reader) {
                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                if (page.getOutboundLinks().size() > 0) {
                  log.info("Loading page {} with {} links", page.getUrl(), page.getOutboundLinks().size());
                  if (rateLimiter != null) {
                    rateLimiter.acquire();
                  }
                  le.execute(PageLoader.updatePage(page));
                }
              }
            }
          } catch (IOException e) {
            log.error("Exception while processing {}", path, e);
          }
        });
      }
    });
  }
}
From source file:io.prestosql.plugin.hive.util.HiveFileIterator.java
License:Apache License
@Override
protected LocatedFileStatus computeNext() {
  while (true) {
    while (remoteIterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(remoteIterator);
      // Ignore hidden files and directories. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }
      if (status.isDirectory()) {
        switch (nestedDirectoryPolicy) {
          case IGNORED:
            continue;
          case RECURSE:
            paths.add(status.getPath());
            continue;
          case FAIL:
            throw new NestedDirectoryNotAllowedException();
        }
      }
      return status;
    }
    if (paths.isEmpty()) {
      return endOfData();
    }
    remoteIterator = getLocatedFileStatusRemoteIterator(paths.removeFirst());
  }
}
From source file:nl.kpmg.lcm.server.data.hdfs.HdfsFileSystemAdapter.java
License:Apache License
@Override
public List listFileNames(String subPath) throws IOException {
  Configuration conf = new Configuration();
  conf.set("fs.defaultFS", storage.getUrl());
  FileSystem hdfs = FileSystem.get(conf);
  String storagePath = "/" + storage.getPath() + "/" + subPath;
  Path filePath = new Path(storagePath);
  if (!hdfs.exists(filePath)) {
    return null;
  }
  RemoteIterator<LocatedFileStatus> fileList = hdfs.listFiles(filePath, false);
  LinkedList<String> fileNameList = new LinkedList();
  while (fileList.hasNext()) {
    LocatedFileStatus fileStatus = fileList.next();
    fileNameList.add(fileStatus.getPath().getName());
  }
  return fileNameList;
}
From source file:org.apache.apex.malhar.lib.state.managed.BucketsFileSystem.java
License:Apache License
protected void deleteTimeBucketsLessThanEqualTo(long latestExpiredTimeBucket) throws IOException {
  LOG.debug("delete files before {}", latestExpiredTimeBucket);
  for (long bucketName : bucketNamesOnFS) {
    RemoteIterator<LocatedFileStatus> timeBucketsIterator = listFiles(bucketName);
    boolean emptyBucket = true;
    while (timeBucketsIterator.hasNext()) {
      LocatedFileStatus timeBucketStatus = timeBucketsIterator.next();
      String timeBucketStr = timeBucketStatus.getPath().getName();
      if (timeBucketStr.equals(BucketsFileSystem.META_FILE_NAME) || timeBucketStr.endsWith(".tmp")) {
        // ignoring meta and tmp files
        continue;
      }
      long timeBucket = Long.parseLong(timeBucketStr);
      if (timeBucket <= latestExpiredTimeBucket) {
        LOG.debug("deleting bucket {} time-bucket {}", bucketName, timeBucket);
        invalidateTimeBucket(bucketName, timeBucket);
        delete(bucketName, timeBucketStatus.getPath().getName());
      } else {
        emptyBucket = false;
      }
    }
    if (emptyBucket) {
      LOG.debug("deleting bucket {}", bucketName);
      deleteBucket(bucketName);
    }
  }
}
From source file:org.apache.apex.malhar.lib.state.managed.ManagedStateTestUtils.java
License:Apache License
/**
 * Validates the bucket data on the File System.
 * @param fileAccess file access
 * @param bucketId bucket id
 * @param unsavedBucket bucket data to compare with.
 * @param keysPerTimeBucket num keys per time bucket
 * @throws IOException
 */
public static void validateBucketOnFileSystem(FileAccess fileAccess, long bucketId,
    Map<Slice, Bucket.BucketedValue> unsavedBucket, int keysPerTimeBucket) throws IOException {
  RemoteIterator<LocatedFileStatus> iterator = fileAccess.listFiles(bucketId);
  TreeMap<Slice, Slice> fromDisk = Maps.newTreeMap(new SliceComparator());
  int size = 0;
  while (iterator.hasNext()) {
    LocatedFileStatus fileStatus = iterator.next();
    String timeBucketStr = fileStatus.getPath().getName();
    if (timeBucketStr.equals(BucketsFileSystem.META_FILE_NAME) || timeBucketStr.endsWith(".tmp")) {
      // ignoring meta file
      continue;
    }
    LOG.debug("bucket {} time-bucket {}", bucketId, timeBucketStr);
    FileAccess.FileReader reader = fileAccess.getReader(bucketId, timeBucketStr);
    reader.readFully(fromDisk);
    size += keysPerTimeBucket;
    Assert.assertEquals("size of bucket " + bucketId, size, fromDisk.size());
  }
  Assert.assertEquals("size of bucket " + bucketId, unsavedBucket.size(), fromDisk.size());
  Map<Slice, Slice> testBucket = Maps.transformValues(unsavedBucket, new Function<Bucket.BucketedValue, Slice>() {
    @Override
    public Slice apply(@Nullable Bucket.BucketedValue input) {
      assert input != null;
      return input.getValue();
    }
  });
  Assert.assertEquals("data of bucket" + bucketId, testBucket, fromDisk);
}
From source file:org.apache.carbondata.hadoop.CarbonInputFormat.java
License:Apache License
private void getFileStatusOfSegments(JobContext job, String[] segmentsToConsider, List<FileStatus> result)
    throws IOException {
  String[] partitionsToConsider = getValidPartitions(job);
  if (partitionsToConsider.length == 0) {
    throw new IOException("No partitions/data found");
  }
  PathFilter inputFilter = getDataFileFilter(job);
  CarbonTablePath tablePath = getTablePath(job.getConfiguration());
  // get tokens for all the required FileSystem for table path
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { tablePath }, job.getConfiguration());
  // get all data files of valid partitions and segments
  for (int i = 0; i < partitionsToConsider.length; ++i) {
    String partition = partitionsToConsider[i];
    for (int j = 0; j < segmentsToConsider.length; ++j) {
      String segmentId = segmentsToConsider[j];
      Path segmentPath = new Path(tablePath.getCarbonDataDirectoryPath(partition, segmentId));
      FileSystem fs = segmentPath.getFileSystem(job.getConfiguration());
      RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(segmentPath);
      while (iter.hasNext()) {
        LocatedFileStatus stat = iter.next();
        if (inputFilter.accept(stat.getPath())) {
          if (stat.isDirectory()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
          } else {
            result.add(stat);
          }
        }
      }
    }
  }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);
  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
  setJobName(jobConf, segments);
  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }
  final Job job = Job.getInstance(jobConf);
  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);
  JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())), job);
  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }
    final JobID jobID = job.getJobID();
    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }
    final List<DataSegment> returnList = ImmutableList.copyOf(
        Lists.transform(goodPaths, new Function<Path, DataSegment>() {
          @Nullable
          @Override
          public DataSegment apply(final Path input) {
            try {
              if (!fs.exists(input)) {
                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                    ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
              }
            } catch (final IOException e) {
              throw Throwables.propagate(e);
            }
            try (final InputStream stream = fs.open(input)) {
              return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
            } catch (final IOException e) {
              throw Throwables.propagate(e);
            }
          }
        }));
    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE("Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(), returnList.size(), jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}
From source file:org.apache.druid.storage.hdfs.HdfsDataSegmentFinder.java
License:Apache License
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Map<String, Pair<DataSegment, Long>> timestampedSegments = new HashMap<>();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        // There are 3 supported path formats:
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum/descriptor.json
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum_descriptor.json
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum_UUID_descriptor.json
        final String[] descriptorParts = path.getName().split("_");
        Path indexZip = new Path(path.getParent(), "index.zip");
        if (descriptorParts.length > 1) {
          Preconditions.checkState(
              descriptorParts.length <= 3
                  && org.apache.commons.lang.StringUtils.isNumeric(descriptorParts[0])
                  && "descriptor.json".equals(descriptorParts[descriptorParts.length - 1]),
              "Unexpected descriptor filename format [%s]", path);
          indexZip = new Path(path.getParent(),
              StringUtils.format("%s_%sindex.zip", descriptorParts[0],
                  descriptorParts.length == 2 ? "" : descriptorParts[1] + "_"));
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME)
              || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          DataSegmentFinder.putInMapRetainingNewest(timestampedSegments, dataSegment,
              locatedFileStatus.getModificationTime());
        } else {
          throw new SegmentLoadingException("index.zip didn't exist at [%s] while descriptor.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return timestampedSegments.values().stream().map(x -> x.lhs).collect(Collectors.toSet());
}