List of usage examples for org.apache.hadoop.fs LocatedFileStatus getPath
public Path getPath()
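getPath() returns the fully qualified org.apache.hadoop.fs.Path of the listed entry. Before the project-specific examples below, here is a minimal, self-contained sketch of the pattern they all share: list a directory with FileSystem.listFiles() and read each entry's path via LocatedFileStatus.getPath(). The class name, the fallback "/tmp/data" directory, and the printing are illustrative assumptions, not taken from any of the projects listed below.

// Minimal sketch (assumed setup): recursively list a directory and print each file's Path.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListPathsExample {
  public static void main(String[] args) throws IOException {
    // Hypothetical input directory; replace with a real HDFS or local path.
    Path dir = new Path(args.length > 0 ? args[0] : "/tmp/data");
    FileSystem fs = dir.getFileSystem(new Configuration());
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true); // true = recursive
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      // getPath() returns the fully qualified Path of the listed file.
      System.out.println(status.getPath());
    }
  }
}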
From source file:io.druid.storage.hdfs.HdfsDataSegmentFinder.java
License:Apache License
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Set<DataSegment> segments = Sets.newHashSet();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        final Path indexZip;
        final String[] descriptorParts = path.getName().split("_");
        if (descriptorParts.length == 2
            && descriptorParts[1].equals("descriptor.json")
            && org.apache.commons.lang.StringUtils.isNumeric(descriptorParts[0])) {
          indexZip = new Path(path.getParent(), StringUtils.format("%s_index.zip", descriptorParts[0]));
        } else {
          indexZip = new Path(path.getParent(), "index.zip");
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME)
              || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          segments.add(dataSegment);
        } else {
          throw new SegmentLoadingException("index.zip didn't exist at [%s] while descriptor.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return segments;
}
From source file:io.druid.storage.hdfs.HdfsDataSegmentPuller.java
License:Apache License
public FileUtils.FileCopyResult getSegmentFiles(final Path path, final File outDir) throws SegmentLoadingException {
  final LocalFileSystem localFileSystem = new LocalFileSystem();
  try {
    final FileSystem fs = path.getFileSystem(config);
    if (fs.isDirectory(path)) {
      // -------- directory ---------
      try {
        return RetryUtils.retry(new Callable<FileUtils.FileCopyResult>() {
          @Override
          public FileUtils.FileCopyResult call() throws Exception {
            if (!fs.exists(path)) {
              throw new SegmentLoadingException("No files found at [%s]", path.toString());
            }
            final RemoteIterator<LocatedFileStatus> children = fs.listFiles(path, false);
            final ArrayList<FileUtils.FileCopyResult> localChildren = new ArrayList<>();
            final FileUtils.FileCopyResult result = new FileUtils.FileCopyResult();
            while (children.hasNext()) {
              final LocatedFileStatus child = children.next();
              final Path childPath = child.getPath();
              final String fname = childPath.getName();
              if (fs.isDirectory(childPath)) {
                log.warn("[%s] is a child directory, skipping", childPath.toString());
              } else {
                final File outFile = new File(outDir, fname);
                // Actual copy
                fs.copyToLocalFile(childPath, new Path(outFile.toURI()));
                result.addFile(outFile);
              }
            }
            log.info("Copied %d bytes from [%s] to [%s]", result.size(), path.toString(), outDir.getAbsolutePath());
            return result;
          }
        }, shouldRetryPredicate(), DEFAULT_RETRY_COUNT);
      } catch (Exception e) {
        throw Throwables.propagate(e);
      }
    } else if (CompressionUtils.isZip(path.getName())) {
      // -------- zip ---------
      final FileUtils.FileCopyResult result = CompressionUtils.unzip(new ByteSource() {
        @Override
        public InputStream openStream() throws IOException {
          return getInputStream(path);
        }
      }, outDir, shouldRetryPredicate(), false);
      log.info("Unzipped %d bytes from [%s] to [%s]", result.size(), path.toString(), outDir.getAbsolutePath());
      return result;
    } else if (CompressionUtils.isGz(path.getName())) {
      // -------- gzip ---------
      final String fname = path.getName();
      final File outFile = new File(outDir, CompressionUtils.getGzBaseName(fname));
      final FileUtils.FileCopyResult result = CompressionUtils.gunzip(new ByteSource() {
        @Override
        public InputStream openStream() throws IOException {
          return getInputStream(path);
        }
      }, outFile);
      log.info("Gunzipped %d bytes from [%s] to [%s]", result.size(), path.toString(), outFile.getAbsolutePath());
      return result;
    } else {
      throw new SegmentLoadingException("Do not know how to handle file type at [%s]", path.toString());
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Error loading [%s]", path.toString());
  }
}
From source file:io.fluo.webindex.data.LoadHdfs.java
License:Apache License
public static void main(String[] args) throws Exception {
  if (args.length != 1) {
    log.error("Usage: LoadHdfs <dataDir>");
    System.exit(1);
  }
  final String dataDir = args[0];
  IndexEnv.validateDataDir(dataDir);
  final String hadoopConfDir = IndexEnv.getHadoopConfDir();
  final int rateLimit = DataConfig.load().getLoadRateLimit();
  List<String> loadPaths = new ArrayList<>();
  FileSystem hdfs = IndexEnv.getHDFS();
  RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(new Path(dataDir), true);
  while (listIter.hasNext()) {
    LocatedFileStatus status = listIter.next();
    if (status.isFile()) {
      loadPaths.add(status.getPath().toString());
    }
  }
  log.info("Loading {} files into Fluo from {}", loadPaths.size(), dataDir);
  SparkConf sparkConf = new SparkConf().setAppName("webindex-load-hdfs");
  try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {
    JavaRDD<String> paths = ctx.parallelize(loadPaths, loadPaths.size());
    paths.foreachPartition(iter -> {
      final FluoConfiguration fluoConfig = new FluoConfiguration(new File("fluo.properties"));
      final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
      FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
      try (FluoClient client = FluoFactory.newClient(fluoConfig);
          LoaderExecutor le = client.newLoaderExecutor()) {
        iter.forEachRemaining(path -> {
          Path filePath = new Path(path);
          try {
            if (fs.exists(filePath)) {
              FSDataInputStream fsin = fs.open(filePath);
              ArchiveReader reader = WARCReaderFactory.get(filePath.getName(), fsin, true);
              for (ArchiveRecord record : reader) {
                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                if (page.getOutboundLinks().size() > 0) {
                  log.info("Loading page {} with {} links", page.getUrl(), page.getOutboundLinks().size());
                  if (rateLimiter != null) {
                    rateLimiter.acquire();
                  }
                  le.execute(PageLoader.updatePage(page));
                }
              }
            }
          } catch (IOException e) {
            log.error("Exception while processing {}", path, e);
          }
        });
      }
    });
  }
}
From source file:io.prestosql.plugin.hive.util.HiveFileIterator.java
License:Apache License
@Override
protected LocatedFileStatus computeNext() {
  while (true) {
    while (remoteIterator.hasNext()) {
      LocatedFileStatus status = getLocatedFileStatus(remoteIterator);
      // Ignore hidden files and directories. Hive ignores files starting with _ and . as well.
      String fileName = status.getPath().getName();
      if (fileName.startsWith("_") || fileName.startsWith(".")) {
        continue;
      }
      if (status.isDirectory()) {
        switch (nestedDirectoryPolicy) {
          case IGNORED:
            continue;
          case RECURSE:
            paths.add(status.getPath());
            continue;
          case FAIL:
            throw new NestedDirectoryNotAllowedException();
        }
      }
      return status;
    }
    if (paths.isEmpty()) {
      return endOfData();
    }
    remoteIterator = getLocatedFileStatusRemoteIterator(paths.removeFirst());
  }
}
From source file:nl.kpmg.lcm.server.data.hdfs.HdfsFileSystemAdapter.java
License:Apache License
@Override
public List listFileNames(String subPath) throws IOException {
  Configuration conf = new Configuration();
  conf.set("fs.defaultFS", storage.getUrl());
  FileSystem hdfs = FileSystem.get(conf);
  String storagePath = "/" + storage.getPath() + "/" + subPath;
  Path filePath = new Path(storagePath);
  if (!hdfs.exists(filePath)) {
    return null;
  }
  RemoteIterator<LocatedFileStatus> fileList = hdfs.listFiles(filePath, false);
  LinkedList<String> fileNameList = new LinkedList();
  while (fileList.hasNext()) {
    LocatedFileStatus fileStatus = fileList.next();
    fileNameList.add(fileStatus.getPath().getName());
  }
  return fileNameList;
}
From source file:org.apache.apex.malhar.lib.state.managed.BucketsFileSystem.java
License:Apache License
protected void deleteTimeBucketsLessThanEqualTo(long latestExpiredTimeBucket) throws IOException {
  LOG.debug("delete files before {}", latestExpiredTimeBucket);
  for (long bucketName : bucketNamesOnFS) {
    RemoteIterator<LocatedFileStatus> timeBucketsIterator = listFiles(bucketName);
    boolean emptyBucket = true;
    while (timeBucketsIterator.hasNext()) {
      LocatedFileStatus timeBucketStatus = timeBucketsIterator.next();
      String timeBucketStr = timeBucketStatus.getPath().getName();
      if (timeBucketStr.equals(BucketsFileSystem.META_FILE_NAME) || timeBucketStr.endsWith(".tmp")) {
        // ignoring meta and tmp files
        continue;
      }
      long timeBucket = Long.parseLong(timeBucketStr);
      if (timeBucket <= latestExpiredTimeBucket) {
        LOG.debug("deleting bucket {} time-bucket {}", bucketName, timeBucket);
        invalidateTimeBucket(bucketName, timeBucket);
        delete(bucketName, timeBucketStatus.getPath().getName());
      } else {
        emptyBucket = false;
      }
    }
    if (emptyBucket) {
      LOG.debug("deleting bucket {}", bucketName);
      deleteBucket(bucketName);
    }
  }
}
From source file:org.apache.apex.malhar.lib.state.managed.ManagedStateTestUtils.java
License:Apache License
/**
 * Validates the bucket data on the File System.
 * @param fileAccess file access
 * @param bucketId bucket id
 * @param unsavedBucket bucket data to compare with.
 * @param keysPerTimeBucket num keys per time bucket
 * @throws IOException
 */
public static void validateBucketOnFileSystem(FileAccess fileAccess, long bucketId,
    Map<Slice, Bucket.BucketedValue> unsavedBucket, int keysPerTimeBucket) throws IOException {
  RemoteIterator<LocatedFileStatus> iterator = fileAccess.listFiles(bucketId);
  TreeMap<Slice, Slice> fromDisk = Maps.newTreeMap(new SliceComparator());
  int size = 0;
  while (iterator.hasNext()) {
    LocatedFileStatus fileStatus = iterator.next();
    String timeBucketStr = fileStatus.getPath().getName();
    if (timeBucketStr.equals(BucketsFileSystem.META_FILE_NAME) || timeBucketStr.endsWith(".tmp")) {
      // ignoring meta file
      continue;
    }
    LOG.debug("bucket {} time-bucket {}", bucketId, timeBucketStr);
    FileAccess.FileReader reader = fileAccess.getReader(bucketId, timeBucketStr);
    reader.readFully(fromDisk);
    size += keysPerTimeBucket;
    Assert.assertEquals("size of bucket " + bucketId, size, fromDisk.size());
  }
  Assert.assertEquals("size of bucket " + bucketId, unsavedBucket.size(), fromDisk.size());
  Map<Slice, Slice> testBucket = Maps.transformValues(unsavedBucket, new Function<Bucket.BucketedValue, Slice>() {
    @Override
    public Slice apply(@Nullable Bucket.BucketedValue input) {
      assert input != null;
      return input.getValue();
    }
  });
  Assert.assertEquals("data of bucket" + bucketId, testBucket, fromDisk);
}
From source file:org.apache.carbondata.hadoop.CarbonInputFormat.java
License:Apache License
private void getFileStatusOfSegments(JobContext job, String[] segmentsToConsider, List<FileStatus> result)
    throws IOException {
  String[] partitionsToConsider = getValidPartitions(job);
  if (partitionsToConsider.length == 0) {
    throw new IOException("No partitions/data found");
  }
  PathFilter inputFilter = getDataFileFilter(job);
  CarbonTablePath tablePath = getTablePath(job.getConfiguration());
  // get tokens for all the required FileSystem for table path
  TokenCache.obtainTokensForNamenodes(job.getCredentials(), new Path[] { tablePath }, job.getConfiguration());
  // get all data files of valid partitions and segments
  for (int i = 0; i < partitionsToConsider.length; ++i) {
    String partition = partitionsToConsider[i];
    for (int j = 0; j < segmentsToConsider.length; ++j) {
      String segmentId = segmentsToConsider[j];
      Path segmentPath = new Path(tablePath.getCarbonDataDirectoryPath(partition, segmentId));
      FileSystem fs = segmentPath.getFileSystem(job.getConfiguration());
      RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(segmentPath);
      while (iter.hasNext()) {
        LocatedFileStatus stat = iter.next();
        if (inputFilter.accept(stat.getPath())) {
          if (stat.isDirectory()) {
            addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
          } else {
            result.add(stat);
          }
        }
      }
    }
  }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
  final JobConf jobConf = new JobConf();
  jobConf.setKeepFailedTaskFiles(false);
  for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
    jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
  }
  final List<DataSegment> segments = converterConfig.getSegments();
  if (segments.isEmpty()) {
    throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
  }
  converterConfigIntoConfiguration(converterConfig, segments, jobConf);
  jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
  jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));
  setJobName(jobConf, segments);
  if (converterConfig.getJobPriority() != null) {
    jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
  }
  final Job job = Job.getInstance(jobConf);
  job.setInputFormatClass(ConfigInputFormat.class);
  job.setMapperClass(ConvertingMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
  job.setMapSpeculativeExecution(false);
  job.setOutputFormatClass(ConvertingOutputFormat.class);
  JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
      JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())), job);
  Throwable throwable = null;
  try {
    job.submit();
    log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
    final boolean success = job.waitForCompletion(true);
    if (!success) {
      final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
      if (reports != null) {
        for (final TaskReport report : reports) {
          log.error("Error in task [%s] : %s", report.getTaskId(), Arrays.toString(report.getDiagnostics()));
        }
      }
      return null;
    }
    try {
      loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
      writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
    } catch (IOException ex) {
      log.error(ex, "Could not fetch counters");
    }
    final JobID jobID = job.getJobID();
    final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
    final List<Path> goodPaths = new ArrayList<>();
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      if (locatedFileStatus.isFile()) {
        final Path myPath = locatedFileStatus.getPath();
        if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
          goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
        }
      }
    }
    if (goodPaths.isEmpty()) {
      log.warn("No good data found at [%s]", jobDir);
      return null;
    }
    final List<DataSegment> returnList = ImmutableList.copyOf(
        Lists.transform(goodPaths, new Function<Path, DataSegment>() {
          @Nullable
          @Override
          public DataSegment apply(final Path input) {
            try {
              if (!fs.exists(input)) {
                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                    ConvertingOutputFormat.DATA_SUCCESS_KEY, ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
              }
            } catch (final IOException e) {
              throw Throwables.propagate(e);
            }
            try (final InputStream stream = fs.open(input)) {
              return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
            } catch (final IOException e) {
              throw Throwables.propagate(e);
            }
          }
        }));
    if (returnList.size() == segments.size()) {
      return returnList;
    } else {
      throw new ISE("Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
          segments.size(), returnList.size(), jobDir);
    }
  } catch (InterruptedException | ClassNotFoundException e) {
    RuntimeException exception = Throwables.propagate(e);
    throwable = exception;
    throw exception;
  } catch (Throwable t) {
    throwable = t;
    throw t;
  } finally {
    try {
      cleanup(job);
    } catch (IOException e) {
      if (throwable != null) {
        throwable.addSuppressed(e);
      } else {
        log.error(e, "Could not clean up job [%s]", job.getJobID());
      }
    }
  }
}
From source file:org.apache.druid.storage.hdfs.HdfsDataSegmentFinder.java
License:Apache License
@Override
public Set<DataSegment> findSegments(String workingDirPathStr, boolean updateDescriptor) throws SegmentLoadingException {
  final Map<String, Pair<DataSegment, Long>> timestampedSegments = new HashMap<>();
  final Path workingDirPath = new Path(workingDirPathStr);
  FileSystem fs;
  try {
    fs = workingDirPath.getFileSystem(config);
    log.info(fs.getScheme());
    log.info("FileSystem URI:" + fs.getUri().toString());
    if (!fs.exists(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] doesn't exist.", workingDirPath);
    }
    if (!fs.isDirectory(workingDirPath)) {
      throw new SegmentLoadingException("Working directory [%s] is not a directory!?", workingDirPath);
    }
    final RemoteIterator<LocatedFileStatus> it = fs.listFiles(workingDirPath, true);
    while (it.hasNext()) {
      final LocatedFileStatus locatedFileStatus = it.next();
      final Path path = locatedFileStatus.getPath();
      if (path.getName().endsWith("descriptor.json")) {
        // There are 3 supported path formats:
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum/descriptor.json
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum_descriptor.json
        // - hdfs://nn1/hdfs_base_directory/data_source_name/interval/version/shardNum_UUID_descriptor.json
        final String[] descriptorParts = path.getName().split("_");
        Path indexZip = new Path(path.getParent(), "index.zip");
        if (descriptorParts.length > 1) {
          Preconditions.checkState(
              descriptorParts.length <= 3
                  && org.apache.commons.lang.StringUtils.isNumeric(descriptorParts[0])
                  && "descriptor.json".equals(descriptorParts[descriptorParts.length - 1]),
              "Unexpected descriptor filename format [%s]", path);
          indexZip = new Path(path.getParent(),
              StringUtils.format("%s_%sindex.zip", descriptorParts[0],
                  descriptorParts.length == 2 ? "" : descriptorParts[1] + "_"));
        }
        if (fs.exists(indexZip)) {
          final DataSegment dataSegment = mapper.readValue(fs.open(path), DataSegment.class);
          log.info("Found segment [%s] located at [%s]", dataSegment.getIdentifier(), indexZip);
          final Map<String, Object> loadSpec = dataSegment.getLoadSpec();
          final String pathWithoutScheme = indexZip.toUri().getPath();
          if (!loadSpec.get("type").equals(HdfsStorageDruidModule.SCHEME)
              || !loadSpec.get("path").equals(pathWithoutScheme)) {
            loadSpec.put("type", HdfsStorageDruidModule.SCHEME);
            loadSpec.put("path", pathWithoutScheme);
            if (updateDescriptor) {
              log.info("Updating loadSpec in descriptor.json at [%s] with new path [%s]", path, pathWithoutScheme);
              mapper.writeValue(fs.create(path, true), dataSegment);
            }
          }
          DataSegmentFinder.putInMapRetainingNewest(timestampedSegments, dataSegment,
              locatedFileStatus.getModificationTime());
        } else {
          throw new SegmentLoadingException("index.zip didn't exist at [%s] while descriptor.json exists!?", indexZip);
        }
      }
    }
  } catch (IOException e) {
    throw new SegmentLoadingException(e, "Problems interacting with filesystem[%s].", workingDirPath);
  }
  return timestampedSegments.values().stream().map(x -> x.lhs).collect(Collectors.toSet());
}