Example usage for org.apache.hadoop.fs FileSystem listFiles

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem listFiles.

Prototype

public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive)
        throws FileNotFoundException, IOException 

Document

List the statuses and block locations of the files in the given path.
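
A minimal, self-contained sketch of the call (the default Configuration, the /tmp/data path, and the printed fields are illustrative assumptions, not part of the API contract):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class ListFilesSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        try (FileSystem fs = FileSystem.get(conf)) {
            // recursive = true walks the whole tree; the iterator yields files only, never directories
            RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/tmp/data"), true);
            while (it.hasNext()) {
                LocatedFileStatus status = it.next();
                System.out.println(status.getPath() + " (" + status.getLen() + " bytes)");
            }
        }
    }
}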

Usage

From source file:com.github.sakserv.storm.KafkaHiveHdfsTopologyTest.java

License:Apache License
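
This test helper recursively lists everything under /tmp/kafka_data on an embedded HDFS cluster and prints each line of every file found.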

public void validateHdfsResults() throws IOException {
    System.out.println("HDFS: VALIDATING");
    FileSystem hdfsFsHandle = hdfsCluster.getHdfsFileSystemHandle();
    RemoteIterator<LocatedFileStatus> listFiles = hdfsFsHandle.listFiles(new Path("/tmp/kafka_data"), true);
    while (listFiles.hasNext()) {
        LocatedFileStatus file = listFiles.next();

        System.out.println("HDFS READ: Found File: " + file);

        // use try-with-resources so the reader is always closed
        try (BufferedReader br = new BufferedReader(new InputStreamReader(hdfsFsHandle.open(file.getPath())))) {
            String line = br.readLine();
            while (line != null) {
                System.out.println("HDFS READ: Found Line: " + line);
                line = br.readLine();
            }
        }
    }
    hdfsFsHandle.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License
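
This benchmark repeatedly lists a directory (non-recursively) and reports the total execution time and the average per-listing latency in microseconds.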

void enumerateDir() throws Exception {
    System.out.println("enumarate dir, path " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    int repfactor = 4;
    for (int k = 0; k < repfactor; k++) {
        long start = System.currentTimeMillis();
        for (int i = 0; i < size; i++) {
            // single operation == loop
            RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false);
            while (iter.hasNext()) {
                iter.next();
            }
        }
        long end = System.currentTimeMillis();
        double executionTime = ((double) (end - start));
        double latency = executionTime * 1000.0 / ((double) size);
        System.out.println("execution time [ms] " + executionTime);
        System.out.println("latency [us] " + latency);
    }
    fs.close();
}

From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java

License:Apache License
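
The same benchmark class also offers a simple browse: list a directory once and print the path of each entry.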

void browseDir() throws Exception {
    System.out.println("reading enumarate dir, path " + path);
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    // benchmark
    System.out.println("starting benchmark...");
    RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false);
    while (iter.hasNext()) {
        LocatedFileStatus status = iter.next();
        System.out.println(status.getPath());
    }
    fs.close();
}

From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java

License:Apache License
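
Here listFiles enumerates a directory, and every "part-" file whose data lies beyond the requested byte offset is opened and concatenated into a single SequenceInputStream.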

static private InputStream openMultipleParts(FileSystem fs, Path pt, long offset) throws IOException {

    System.out.println("opening all parts in path: " + pt + ", from offset: " + offset);
    // list all files in given path
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();

        // Only read files whose names start with "part-"
        if (path.getName().startsWith("part-")) {
            long fileSize = fs.getFileStatus(path).getLen();
            if (offset < fileSize) {
                FSDataInputStream inputStream = fs.open(path);
                if (offset > 0) {
                    inputStream.seek(offset);
                }
                fileHandleList.add(inputStream);
            }
            offset -= fileSize;
        }
    }

    if (!fileHandleList.isEmpty()) {
        return new SequenceInputStream(fileHandleList.elements());
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }

}

From source file:com.intel.hibench.streambench.FileDataGenNew.java

License:Apache License
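
A close variant of the previous example: part files are collected with listFiles and seeked to the requested offset, then returned either as a single stream or as a SequenceInputStream over all of them.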

private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset) throws IOException {
    System.out.println("Opening files, path:" + pt + " offset:" + offset);
    RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false);
    Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>();
    while (rit.hasNext()) {
        Path path = rit.next().getPath();
        String filename = path.toString().substring(path.getParent().toString().length(),
                path.toString().length());

        if (filename.startsWith("/part-")) {
            long filesize = fs.getFileStatus(path).getLen();
            if (offset < filesize) {
                FSDataInputStream handle = fs.open(path);
                if (offset > 0) {
                    handle.seek(offset);
                }
                fileHandleList.add(handle);
            }
            offset -= filesize;
        }
    }
    if (fileHandleList.size() == 1)
        return fileHandleList.get(0);
    else if (fileHandleList.size() > 1) {
        Enumeration<FSDataInputStream> enu = fileHandleList.elements();
        return new SequenceInputStream(enu);
    } else {
        System.err.println("Error, no source file loaded. run genSeedDataset.sh first!");
        return null;
    }
}

From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java

License:Apache License
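
In this Flink sink, open() recursively lists the base directory and deletes leftover pending and in-progress files that belong to the current subtask.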

@Override
public void open(Configuration parameters) throws Exception {
    super.open(parameters);

    subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();

    state = new State<T>();

    Path baseDirectory = new Path(basePath);
    hadoopConf = HadoopFileSystem.getHadoopConfiguration();
    FileSystem fs = baseDirectory.getFileSystem(hadoopConf);
    refTruncate = reflectTruncate(fs);

    long currentProcessingTime = ((StreamingRuntimeContext) getRuntimeContext()).getCurrentProcessingTime();

    checkForInactiveBuckets(currentProcessingTime);

    ((StreamingRuntimeContext) getRuntimeContext())
            .registerTimer(currentProcessingTime + inactiveBucketCheckInterval, this);

    this.clock = new Clock() {
        @Override
        public long currentTimeMillis() {
            return ((StreamingRuntimeContext) getRuntimeContext()).getCurrentProcessingTime();
        }
    };

    // delete pending/in-progress files that might be left if we fail while
    // no checkpoint has yet been done
    try {
        if (fs.exists(baseDirectory) && cleanupOnOpen) {
            RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(baseDirectory, true);

            while (bucketFiles.hasNext()) {
                LocatedFileStatus file = bucketFiles.next();
                if (file.getPath().toString().endsWith(pendingSuffix)) {
                    // only delete files that contain our subtask index
                    if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                        LOG.debug("(OPEN) Deleting leftover pending file {}", file.getPath().toString());
                        fs.delete(file.getPath(), true);
                    }
                }
                if (file.getPath().toString().endsWith(inProgressSuffix)) {
                    // only delete files that contain our subtask index
                    if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                        LOG.debug("(OPEN) Deleting leftover in-progress file {}", file.getPath().toString());
                        fs.delete(file.getPath(), true);
                    }
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting leftover pending/in-progress files: {}", e);
        throw new RuntimeException("Error while deleting leftover pending/in-progress files.", e);
    }
}

From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java

License:Apache License
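
During checkpoint restore the sink first repairs the file that was being written (truncating it or writing a .valid-length marker), moves confirmed pending files to their final location, and finally uses listFiles to recursively sweep away stale pending/in-progress files for this subtask.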

@Override
public void restoreState(State<T> state) {
    this.state = state;

    FileSystem fs;
    try {
        fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration());
    } catch (IOException e) {
        LOG.error("Error while creating FileSystem in checkpoint restore.", e);
        throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e);
    }

    for (BucketState<T> bucketState : state.bucketStates.values()) {
        // we can clean all the pending files since they were renamed to final files
        // after this checkpoint was successful
        bucketState.pendingFiles.clear();

        if (bucketState.currentFile != null) {
            // We were writing to a file when the last checkpoint occurred. This file can either
            // still be in-progress or have become a pending file at some point after the checkpoint.
            // Either way, we have to truncate it back to a valid state (or write a .valid-length
            // file that specifies up to which length it is valid) and rename it to the final name
            // before starting a new bucket file.
            Path partPath = new Path(bucketState.currentFile);
            try {
                Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName())
                        .suffix(pendingSuffix);
                Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName())
                        .suffix(inProgressSuffix);

                if (fs.exists(partPendingPath)) {
                    LOG.debug(
                            "In-progress file {} has been moved to pending after checkpoint, moving to final location.",
                            partPath);
                    // has been moved to pending in the meantime, rename to final location
                    fs.rename(partPendingPath, partPath);
                } else if (fs.exists(partInProgressPath)) {
                    LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath);
                    // it was still in progress, rename to final path
                    fs.rename(partInProgressPath, partPath);
                } else if (fs.exists(partPath)) {
                    LOG.debug("In-Progress file {} was already moved to final location {}.",
                            bucketState.currentFile, partPath);
                } else {
                    LOG.debug(
                            "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, "
                                    + "it was moved to final location by a previous snapshot restore",
                            bucketState.currentFile);
                }

                refTruncate = reflectTruncate(fs);
                // truncate it or write a ".valid-length" file to specify up to which point it is valid
                if (refTruncate != null) {
                    LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength);
                    // someone else might still hold the lease from a previous try; we are
                    // recovering, after all ...
                    if (fs instanceof DistributedFileSystem) {
                        DistributedFileSystem dfs = (DistributedFileSystem) fs;
                        LOG.debug("Trying to recover file lease {}", partPath);
                        dfs.recoverLease(partPath);
                        boolean isclosed = dfs.isFileClosed(partPath);
                        StopWatch sw = new StopWatch();
                        sw.start();
                        while (!isclosed) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            isclosed = dfs.isFileClosed(partPath);
                        }
                    }
                    Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath,
                            bucketState.currentFileValidLength);
                    if (!truncated) {
                        LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath);

                        // we must wait for the asynchronous truncate operation to complete
                        StopWatch sw = new StopWatch();
                        sw.start();
                        long newLen = fs.getFileStatus(partPath).getLen();
                        while (newLen != bucketState.currentFileValidLength) {
                            if (sw.getTime() > asyncTimeout) {
                                break;
                            }
                            try {
                                Thread.sleep(500);
                            } catch (InterruptedException e1) {
                                // ignore it
                            }
                            newLen = fs.getFileStatus(partPath).getLen();
                        }
                        if (newLen != bucketState.currentFileValidLength) {
                            throw new RuntimeException("Truncate did not truncate to right length. Should be "
                                    + bucketState.currentFileValidLength + " is " + newLen + ".");
                        }
                    }

                } else {
                    LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath,
                            bucketState.currentFileValidLength);
                    Path validLengthFilePath = new Path(partPath.getParent(),
                            validLengthPrefix + partPath.getName()).suffix(validLengthSuffix);
                    if (!fs.exists(validLengthFilePath)) {
                        FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath);
                        lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength));
                        lengthFileOut.close();
                    }
                }

                // Now that we've restored the bucket to a valid state, reset the current file info
                bucketState.currentFile = null;
                bucketState.currentFileValidLength = -1;
            } catch (IOException e) {
                LOG.error("Error while restoring BucketingSink state.", e);
                throw new RuntimeException("Error while restoring BucketingSink state.", e);
            } catch (InvocationTargetException | IllegalAccessException e) {
                LOG.error("Cound not invoke truncate.", e);
                throw new RuntimeException("Could not invoke truncate.", e);
            }
        }

        LOG.debug("Clearing pending/in-progress files.");

        // Move files that are confirmed by a checkpoint but did not get moved to final location
        // because the checkpoint notification did not happen before a failure

        Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet();
        LOG.debug("Moving pending files to final location on restore.");
        for (Long pastCheckpointId : pastCheckpointIds) {
            // All the pending files belong to buckets that have been completed but are waiting
            // to be renamed to their final name
            for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) {
                Path finalPath = new Path(filename);
                Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName())
                        .suffix(pendingSuffix);

                try {
                    if (fs.exists(pendingPath)) {
                        LOG.debug(
                                "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.",
                                pendingPath, pastCheckpointId);
                        fs.rename(pendingPath, finalPath);
                    }
                } catch (IOException e) {
                    LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}",
                            pendingPath, finalPath, e);
                    throw new RuntimeException(
                            "Error while renaming pending file " + pendingPath + " to final path " + finalPath,
                            e);
                }
            }
        }

        synchronized (bucketState.pendingFilesPerCheckpoint) {
            bucketState.pendingFilesPerCheckpoint.clear();
        }
    }

    // we need to get this here since open() has not yet been called
    int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask();
    // delete pending files
    try {

        RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true);

        while (bucketFiles.hasNext()) {
            LocatedFileStatus file = bucketFiles.next();
            if (file.getPath().toString().endsWith(pendingSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
            if (file.getPath().toString().endsWith(inProgressSuffix)) {
                // only delete files that contain our subtask index
                if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) {
                    LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString());
                    fs.delete(file.getPath(), true);
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error while deleting old pending files: {}", e);
        throw new RuntimeException("Error while deleting old pending files.", e);
    }
}

From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java

License:Apache License
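
This output committer recursively lists a task attempt path and collects all file statuses that pass a hidden-path filter.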

@Override
protected List<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException {
    PathFilter filter = HiddenPathFilter.get();

    // get files on the local FS in the attempt path
    Path attemptPath = getTaskAttemptPath(context);
    FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration());
    RemoteIterator<LocatedFileStatus> iter = attemptFS.listFiles(attemptPath, true /* recursive */ );

    List<FileStatus> stats = Lists.newArrayList();
    while (iter.hasNext()) {
        FileStatus stat = iter.next();
        if (filter.accept(stat.getPath())) {
            stats.add(stat);
        }
    }

    return stats;
}

From source file:com.simiacryptus.mindseye.applications.HadoopUtil.java

License:Apache License
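
A utility that lists a path non-recursively, converts the results to path strings, and verifies that each child still exists.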

/**
 * Gets files.
 *
 * @param file the file
 * @return the files
 */
public static List<CharSequence> getFiles(CharSequence file) {
    try {
        FileSystem fileSystem = getFileSystem(file);
        Path path = new Path(file.toString());
        if (!fileSystem.exists(path))
            throw new IllegalStateException(path + " does not exist");
        List<CharSequence> collect = toStream(fileSystem.listFiles(path, false)).map(FileStatus::getPath)
                .map(Path::toString).collect(Collectors.toList());
        collect.stream().forEach(child -> {
            try {
                if (!fileSystem.exists(new Path(child.toString())))
                    throw new IllegalStateException(child + " does not exist");
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        });
        return collect;
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}

From source file:com.splicemachine.tutorials.vti.ORCRecordIterator.java

License:Apache License
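
This constructor keeps the RemoteIterator returned by listFiles as an instance field, so the files in a folder of ORC data can be consumed lazily, starting with the first one.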

/**
 * Constructor performs initialization and sets the record iterator
 * @param filesystem
 *            : HDFS File System
 * @param filePath
 *            : File or folder path in HDFS
 * @param execRow
 *            : Format of the result record
 * @throws IOException
 */
public ORCRecordIterator(FileSystem filesystem, Path filePath, ExecRow execRow) {
    // set the instance variable of filesystem to use later
    this.filesystem = filesystem;

    // Set the instance variable of the result record format to be used
    // later
    this.execRow = execRow;

    try {
        Path curFiletoProcess = null;
        Reader reader;

        // Check if filePath specifies a file or a folder.
        // If it's a folder, set the flag and get the first file in the folder
        if (filesystem.isDirectory(filePath)) {
            isDir = true;
            this.fileList = filesystem.listFiles(filePath, false);
            curFiletoProcess = fileList.next().getPath();
        } else {
            curFiletoProcess = filePath;
        }

        // Get the reader for the single file (first file in case of folder)
        reader = getReader(curFiletoProcess);

        // Get the inspector for the format of the record in the ORC File
        this.inspector = (StructObjectInspector) reader.getObjectInspector();

        // Retrieve the Records from reader to process
        records = reader.rows();

    } catch (Exception e) {
        try {
            if (records != null)
                records.close();
        } catch (Exception cE) {
            throw new RuntimeException(cE);
        }
        throw new RuntimeException(e);
    }

}