List of usage examples for org.apache.hadoop.fs FileSystem listFiles
public RemoteIterator<LocatedFileStatus> listFiles(final Path f, final boolean recursive) throws FileNotFoundException, IOException
From source file:com.github.sakserv.storm.KafkaHiveHdfsTopologyTest.java
License:Apache License
public void validateHdfsResults() throws IOException { System.out.println("HDFS: VALIDATING"); FileSystem hdfsFsHandle = hdfsCluster.getHdfsFileSystemHandle(); RemoteIterator<LocatedFileStatus> listFiles = hdfsFsHandle.listFiles(new Path("/tmp/kafka_data"), true); while (listFiles.hasNext()) { LocatedFileStatus file = listFiles.next(); System.out.println("HDFS READ: Found File: " + file); BufferedReader br = new BufferedReader(new InputStreamReader(hdfsFsHandle.open(file.getPath()))); String line = br.readLine(); while (line != null) { System.out.println("HDFS READ: Found Line: " + line); line = br.readLine();/* w w w . j a v a 2s . c o m*/ } } hdfsFsHandle.close(); }
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
void enumerateDir() throws Exception { System.out.println("enumarate dir, path " + path); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); int repfactor = 4; for (int k = 0; k < repfactor; k++) { long start = System.currentTimeMillis(); for (int i = 0; i < size; i++) { // single operation == loop RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false); while (iter.hasNext()) { iter.next();//from w w w .j a v a 2 s .com } } long end = System.currentTimeMillis(); double executionTime = ((double) (end - start)); double latency = executionTime * 1000.0 / ((double) size); System.out.println("execution time [ms] " + executionTime); System.out.println("latency [us] " + latency); } fs.close(); }
From source file:com.ibm.crail.hdfs.tools.HdfsIOBenchmark.java
License:Apache License
void browseDir() throws Exception { System.out.println("reading enumarate dir, path " + path); Configuration conf = new Configuration(); FileSystem fs = FileSystem.get(conf); //benchmark//from ww w.jav a 2 s. c o m System.out.println("starting benchmark..."); RemoteIterator<LocatedFileStatus> iter = fs.listFiles(path, false); while (iter.hasNext()) { LocatedFileStatus status = iter.next(); System.out.println(status.getPath()); } fs.close(); }
From source file:com.intel.hibench.datagen.streaming.util.SourceFileReader.java
License:Apache License
static private InputStream openMultipleParts(FileSystem fs, Path pt, long offset) throws IOException { System.out.println("opening all parts in path: " + pt + ", from offset: " + offset); // list all files in given path RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false); Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>(); while (rit.hasNext()) { Path path = rit.next().getPath(); // Only read those files start with "part-" if (path.getName().startsWith("part-")) { long fileSize = fs.getFileStatus(path).getLen(); if (offset < fileSize) { FSDataInputStream inputStream = fs.open(path); if (offset > 0) { inputStream.seek(offset); }// w w w . j ava 2s . c o m fileHandleList.add(inputStream); } offset -= fileSize; } } if (!fileHandleList.isEmpty()) { return new SequenceInputStream(fileHandleList.elements()); } else { System.err.println("Error, no source file loaded. run genSeedDataset.sh first!"); return null; } }
From source file:com.intel.hibench.streambench.FileDataGenNew.java
License:Apache License
private InputStream OpenMultiplePartsWithOffset(FileSystem fs, Path pt, long offset) throws IOException { System.out.println("Opening files, path:" + pt + " offset:" + offset); RemoteIterator<LocatedFileStatus> rit = fs.listFiles(pt, false); Vector<FSDataInputStream> fileHandleList = new Vector<FSDataInputStream>(); while (rit.hasNext()) { Path path = rit.next().getPath(); String filename = path.toString().substring(path.getParent().toString().length(), path.toString().length()); if (filename.startsWith("/part-")) { long filesize = fs.getFileStatus(path).getLen(); if (offset < filesize) { FSDataInputStream handle = fs.open(path); if (offset > 0) { handle.seek(offset); }/* www. j a v a2 s.c om*/ fileHandleList.add(handle); } offset -= filesize; } } if (fileHandleList.size() == 1) return fileHandleList.get(0); else if (fileHandleList.size() > 1) { Enumeration<FSDataInputStream> enu = fileHandleList.elements(); return new SequenceInputStream(enu); } else { System.err.println("Error, no source file loaded. run genSeedDataset.sh first!"); return null; } }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override public void open(Configuration parameters) throws Exception { super.open(parameters); subtaskIndex = getRuntimeContext().getIndexOfThisSubtask(); state = new State<T>(); Path baseDirectory = new Path(basePath); hadoopConf = HadoopFileSystem.getHadoopConfiguration(); FileSystem fs = baseDirectory.getFileSystem(hadoopConf); refTruncate = reflectTruncate(fs);//from ww w . j ava 2 s . c om long currentProcessingTime = ((StreamingRuntimeContext) getRuntimeContext()).getCurrentProcessingTime(); checkForInactiveBuckets(currentProcessingTime); ((StreamingRuntimeContext) getRuntimeContext()) .registerTimer(currentProcessingTime + inactiveBucketCheckInterval, this); this.clock = new Clock() { @Override public long currentTimeMillis() { return ((StreamingRuntimeContext) getRuntimeContext()).getCurrentProcessingTime(); } }; // delete pending/in-progress files that might be left if we fail while // no checkpoint has yet been done try { if (fs.exists(baseDirectory) && cleanupOnOpen) { RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(baseDirectory, true); while (bucketFiles.hasNext()) { LocatedFileStatus file = bucketFiles.next(); if (file.getPath().toString().endsWith(pendingSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(OPEN) Deleting leftover pending file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } if (file.getPath().toString().endsWith(inProgressSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(OPEN) Deleting leftover in-progress file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } } } } catch (IOException e) { LOG.error("Error while deleting leftover pending/in-progress files: {}", e); throw new RuntimeException("Error while deleting leftover pending/in-progress files.", e); } }
From source file:com.mvad.flink.demo.streaming.lib.sink.bucketing.BucketingSink.java
License:Apache License
@Override public void restoreState(State<T> state) { this.state = state; FileSystem fs; try {//from w w w.j a v a2 s . c o m fs = new Path(basePath).getFileSystem(HadoopFileSystem.getHadoopConfiguration()); } catch (IOException e) { LOG.error("Error while creating FileSystem in checkpoint restore.", e); throw new RuntimeException("Error while creating FileSystem in checkpoint restore.", e); } for (BucketState<T> bucketState : state.bucketStates.values()) { // we can clean all the pending files since they where renamed to final files // after this checkpoint was successful bucketState.pendingFiles.clear(); if (bucketState.currentFile != null) { // We were writing to a file when the last checkpoint occured. This file can either // be still in-progress or became a pending file at some point after the checkpoint. // Either way, we have to truncate it back to a valid state (or write a .valid-length) // file that specifies up to which length it is valid and rename it to the final name // before starting a new bucket file. Path partPath = new Path(bucketState.currentFile); try { Path partPendingPath = new Path(partPath.getParent(), pendingPrefix + partPath.getName()) .suffix(pendingSuffix); Path partInProgressPath = new Path(partPath.getParent(), inProgressPrefix + partPath.getName()) .suffix(inProgressSuffix); if (fs.exists(partPendingPath)) { LOG.debug( "In-progress file {} has been moved to pending after checkpoint, moving to final location.", partPath); // has been moved to pending in the mean time, rename to final location fs.rename(partPendingPath, partPath); } else if (fs.exists(partInProgressPath)) { LOG.debug("In-progress file {} is still in-progress, moving to final location.", partPath); // it was still in progress, rename to final path fs.rename(partInProgressPath, partPath); } else if (fs.exists(partPath)) { LOG.debug("In-Progress file {} was already moved to final location {}.", bucketState.currentFile, partPath); } else { LOG.debug( "In-Progress file {} was neither moved to pending nor is still in progress. Possibly, " + "it was moved to final location by a previous snapshot restore", bucketState.currentFile); } refTruncate = reflectTruncate(fs); // truncate it or write a ".valid-length" file to specify up to which point it is valid if (refTruncate != null) { LOG.debug("Truncating {} to valid length {}", partPath, bucketState.currentFileValidLength); // some-one else might still hold the lease from a previous try, we are // recovering, after all ... if (fs instanceof DistributedFileSystem) { DistributedFileSystem dfs = (DistributedFileSystem) fs; LOG.debug("Trying to recover file lease {}", partPath); dfs.recoverLease(partPath); boolean isclosed = dfs.isFileClosed(partPath); StopWatch sw = new StopWatch(); sw.start(); while (!isclosed) { if (sw.getTime() > asyncTimeout) { break; } try { Thread.sleep(500); } catch (InterruptedException e1) { // ignore it } isclosed = dfs.isFileClosed(partPath); } } Boolean truncated = (Boolean) refTruncate.invoke(fs, partPath, bucketState.currentFileValidLength); if (!truncated) { LOG.debug("Truncate did not immediately complete for {}, waiting...", partPath); // we must wait for the asynchronous truncate operation to complete StopWatch sw = new StopWatch(); sw.start(); long newLen = fs.getFileStatus(partPath).getLen(); while (newLen != bucketState.currentFileValidLength) { if (sw.getTime() > asyncTimeout) { break; } try { Thread.sleep(500); } catch (InterruptedException e1) { // ignore it } newLen = fs.getFileStatus(partPath).getLen(); } if (newLen != bucketState.currentFileValidLength) { throw new RuntimeException("Truncate did not truncate to right length. Should be " + bucketState.currentFileValidLength + " is " + newLen + "."); } } } else { LOG.debug("Writing valid-length file for {} to specify valid length {}", partPath, bucketState.currentFileValidLength); Path validLengthFilePath = new Path(partPath.getParent(), validLengthPrefix + partPath.getName()).suffix(validLengthSuffix); if (!fs.exists(validLengthFilePath)) { FSDataOutputStream lengthFileOut = fs.create(validLengthFilePath); lengthFileOut.writeUTF(Long.toString(bucketState.currentFileValidLength)); lengthFileOut.close(); } } // Now that we've restored the bucket to a valid state, reset the current file info bucketState.currentFile = null; bucketState.currentFileValidLength = -1; } catch (IOException e) { LOG.error("Error while restoring BucketingSink state.", e); throw new RuntimeException("Error while restoring BucketingSink state.", e); } catch (InvocationTargetException | IllegalAccessException e) { LOG.error("Cound not invoke truncate.", e); throw new RuntimeException("Could not invoke truncate.", e); } } LOG.debug("Clearing pending/in-progress files."); // Move files that are confirmed by a checkpoint but did not get moved to final location // because the checkpoint notification did not happen before a failure Set<Long> pastCheckpointIds = bucketState.pendingFilesPerCheckpoint.keySet(); LOG.debug("Moving pending files to final location on restore."); for (Long pastCheckpointId : pastCheckpointIds) { // All the pending files are buckets that have been completed but are waiting to be renamed // to their final name for (String filename : bucketState.pendingFilesPerCheckpoint.get(pastCheckpointId)) { Path finalPath = new Path(filename); Path pendingPath = new Path(finalPath.getParent(), pendingPrefix + finalPath.getName()) .suffix(pendingSuffix); try { if (fs.exists(pendingPath)) { LOG.debug( "(RESTORE) Moving pending file {} to final location after complete checkpoint {}.", pendingPath, pastCheckpointId); fs.rename(pendingPath, finalPath); } } catch (IOException e) { LOG.error("(RESTORE) Error while renaming pending file {} to final path {}: {}", pendingPath, finalPath, e); throw new RuntimeException( "Error while renaming pending file " + pendingPath + " to final path " + finalPath, e); } } } synchronized (bucketState.pendingFilesPerCheckpoint) { bucketState.pendingFilesPerCheckpoint.clear(); } } // we need to get this here since open() has not yet been called int subtaskIndex = getRuntimeContext().getIndexOfThisSubtask(); // delete pending files try { RemoteIterator<LocatedFileStatus> bucketFiles = fs.listFiles(new Path(basePath), true); while (bucketFiles.hasNext()) { LocatedFileStatus file = bucketFiles.next(); if (file.getPath().toString().endsWith(pendingSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(RESTORE) Deleting pending file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } if (file.getPath().toString().endsWith(inProgressSuffix)) { // only delete files that contain our subtask index if (file.getPath().toString().contains(partPrefix + "-" + subtaskIndex + "-")) { LOG.debug("(RESTORE) Deleting in-progress file {}", file.getPath().toString()); fs.delete(file.getPath(), true); } } } } catch (IOException e) { LOG.error("Error while deleting old pending files: {}", e); throw new RuntimeException("Error while deleting old pending files.", e); } }
From source file:com.netflix.bdp.s3.S3PartitionedOutputCommitter.java
License:Apache License
@Override protected List<FileStatus> getTaskOutput(TaskAttemptContext context) throws IOException { PathFilter filter = HiddenPathFilter.get(); // get files on the local FS in the attempt path Path attemptPath = getTaskAttemptPath(context); FileSystem attemptFS = attemptPath.getFileSystem(context.getConfiguration()); RemoteIterator<LocatedFileStatus> iter = attemptFS.listFiles(attemptPath, true /* recursive */ ); List<FileStatus> stats = Lists.newArrayList(); while (iter.hasNext()) { FileStatus stat = iter.next();//from www. ja v a 2 s.c o m if (filter.accept(stat.getPath())) { stats.add(stat); } } return stats; }
From source file:com.simiacryptus.mindseye.applications.HadoopUtil.java
License:Apache License
/** * Gets files./*from w w w . j a v a2 s . co m*/ * * @param file the file * @return the files */ public static List<CharSequence> getFiles(CharSequence file) { try { FileSystem fileSystem = getFileSystem(file); Path path = new Path(file.toString()); if (!fileSystem.exists(path)) throw new IllegalStateException(path + " does not exist"); List<CharSequence> collect = toStream(fileSystem.listFiles(path, false)).map(FileStatus::getPath) .map(Path::toString).collect(Collectors.toList()); collect.stream().forEach(child -> { try { if (!fileSystem.exists(new Path(child.toString()))) throw new IllegalStateException(child + " does not exist"); } catch (IOException e) { throw new RuntimeException(e); } }); return collect; } catch (IOException e) { throw new RuntimeException(e); } }
From source file:com.splicemachine.tutorials.vti.ORCRecordIterator.java
License:Apache License
/** * Constructor performs initialization and sets the record iterator * /*from ww w. ja va 2 s .c o m*/ * @param filesystem * : HDFS File System * @param filePath * : File or folder path in HDFS * @param execRow * : Format of the result record * @throws IOException */ public ORCRecordIterator(FileSystem filesystem, Path filePath, ExecRow execRow) { // set the instance variable of filesystem to use later this.filesystem = filesystem; // Set the instance variable of the result record format to be used // later this.execRow = execRow; try { Path curFiletoProcess = null; Reader reader; // Check if filePath specifes afile or folder // If its folder, set the flag, and get the first file in the folder if (filesystem.isDirectory(filePath)) { isDir = true; this.fileList = filesystem.listFiles(filePath, false); curFiletoProcess = fileList.next().getPath(); } else { curFiletoProcess = filePath; } // Get the reader for the single file (first file in case of folder) reader = getReader(curFiletoProcess); // Get the inspector for the format of the record in the ORC File this.inspector = (StructObjectInspector) reader.getObjectInspector(); // Retrieve the Records from reader to process records = reader.rows(); } catch (Exception e) { try { if (records != null) records.close(); } catch (Exception cE) { throw new RuntimeException(cE); } throw new RuntimeException(e); } }