List of usage examples for org.apache.hadoop.fs.Path.getName()
public String getName()
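getName() returns the final component of the path, i.e. the file or directory name with everything up to the last slash stripped. A minimal sketch of the call in isolation (the path used here is hypothetical):

import org.apache.hadoop.fs.Path;

public class PathGetNameExample {
    public static void main(String[] args) {
        // Hypothetical path, for illustration only.
        Path p = new Path("hdfs://namenode:8020/user/hadoop/data/part-00000");
        System.out.println(p.getName());   // prints: part-00000
        System.out.println(p.getParent()); // prints: hdfs://namenode:8020/user/hadoop/data
    }
}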
From source file:com.nagarro.nteg.utils.HDFSDirectoryFilesDataReader.java
License:Apache License
@Override
protected FileDataBufferedReader getFileDataBufferedReaderForNewFile() throws IOException {
    final Path hdfsPath = new Path(dirPathName);
    Path locatedFilePath = null;
    final RemoteIterator<LocatedFileStatus> locatedFileIterator = hdfs.listFiles(hdfsPath, true);
    while (locatedFileIterator != null && locatedFileIterator.hasNext()) {
        final LocatedFileStatus locatedFileStatus = locatedFileIterator.next();
        final Path tmpPath = locatedFileStatus.getPath();
        final String pathName = tmpPath.getName();

        if (LOG.isInfoEnabled()) {
            LOG.info("Checking file with name[Log]: " + pathName);
        }

        if (!(pathName.endsWith(FileDataBufferedReader.IN_PROGRESS_FILE_SUFFIX)
                || pathName.endsWith(FileDataBufferedReader.PROCESSED_FILE_SUFFIX))) {
            locatedFilePath = tmpPath;
            break;
        }
    }

    FileDataBufferedReader fileDataBufferedReader = null;
    if (locatedFilePath != null) {
        fileDataBufferedReader = new HDFSFileDataBufferedReader(locatedFilePath, batchSize);
    }

    return fileDataBufferedReader;
}
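In this example getName() supplies the bare file name so it can be filtered by suffix while recursively listing a directory. A reduced sketch of the same idiom, assuming an open FileSystem fs, an enclosing method that throws IOException, and hypothetical suffix and directory names:

// Hedged sketch: skip files whose names end with hypothetical ".inprogress" / ".done" suffixes.
RemoteIterator<LocatedFileStatus> it = fs.listFiles(new Path("/ingest/incoming"), true);
while (it.hasNext()) {
    Path candidate = it.next().getPath();
    String name = candidate.getName();
    if (!name.endsWith(".inprogress") && !name.endsWith(".done")) {
        // process candidate ...
        break;
    }
}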
From source file:com.netease.news.utils.SplitInput.java
License:Apache License
/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the
 * specified training and test output directories. The {@link #validate()} method is called prior to
 * executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(),
                    testSplitStart, splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException("test split size for " + inputFile + " is too large, it would produce an "
                    + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn("Test set size for {} may be too large, {} is larger than the number of "
                    + "lines remaining in the training set: {}", inputFile, testSplitSize,
                    lineCount - testSplitSize);
        }
    }

    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
        Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

        try {
            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }
        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>(
                inputFile, false, fs.getConf());
        SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());

        try {
            int pos = 0;
            while (iterator.hasNext()) {
                pos++;

                SequenceFile.Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }

                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }
        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount,
            trainCount, testCount, testSplitStart);

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}
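The Path.getName() idiom at the heart of this splitter is reusing the input file's own name inside the training and test output directories. Stripped down to just that, assuming org.apache.hadoop.fs.Path is imported (directory and file names are hypothetical):

// Hypothetical locations, for illustration only.
Path inputFile = new Path("/data/in/reviews.txt");
Path trainingOutputDirectory = new Path("/data/split/train");
Path testOutputDirectory = new Path("/data/split/test");

// The child path keeps the original file name under each output directory.
Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName()); // /data/split/train/reviews.txt
Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());         // /data/split/test/reviews.txt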
From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java
License:Apache License
/**
 * The main thing that the addSSTableSplit handles is to split SSTables
 * using their index if available. The general algorithm is that if the file
 * is larger than the blocksize plus some fuzzy factor ...
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);

        long bytesRemaining = length;

        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }
        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors, fs.exists(compressionPath),
                    compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}
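Here getName() is combined with getParent() to derive the paths of companion SSTable files that sit next to the data file. A minimal sketch of that derivation, assuming org.apache.hadoop.fs.Path is imported (the file names are hypothetical and Cassandra's actual naming may differ by version):

// Hypothetical SSTable data file.
Path dataPath = new Path("/cassandra/ks/cf/ks-cf-hc-1-Data.db");

// Companion files live in the same directory and differ only in their suffix.
Path indexPath = new Path(dataPath.getParent(),
        dataPath.getName().replaceAll("-Data.db", "-Index.db"));
Path compressionPath = new Path(dataPath.getParent(),
        dataPath.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
// indexPath       -> /cassandra/ks/cf/ks-cf-hc-1-Index.db
// compressionPath -> /cassandra/ks/cf/ks-cf-hc-1-CompressionInfo.db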
From source file:com.netflix.aegisthus.tools.StorageHelper.java
License:Apache License
public void copyToTemp(String file, String prefix, boolean snappy) throws IOException {
    String target = getBaseTaskAttemptTempLocation();
    Path targetPath = new Path(target, prefix);
    Path filePath = new Path(file);
    Path fullPath = new Path(targetPath, filePath.getName());
    String log = String.format("copying %s to %s", file, fullPath.toUri().toString());
    LOG.info(log);
    ctx.setStatus(log);
    Utils.copy(new Path(file), fullPath, snappy, ctx);
}
From source file:com.netflix.aegisthus.tools.StorageHelper.java
License:Apache License
public void copyToTemp(String file, boolean snappy) throws IOException {
    String target = getBaseTaskAttemptTempLocation();
    Path targetPath = new Path(target);
    Path filePath = new Path(file);
    Path fullPath = new Path(targetPath, filePath.getName());
    String log = String.format("copying %s to %s", file, fullPath.toUri().toString());
    LOG.info(log);
    ctx.setStatus(log);
    Utils.copy(filePath, fullPath, snappy, ctx);
}
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
@Test
public void testSingleTaskCommit() throws Exception {
    Path file = new Path(commitTask(committer, tac, 1).iterator().next());

    List<String> uploads = committer.results.getUploads();
    Assert.assertEquals("Should initiate one upload", 1, uploads.size());

    Path committedPath = committer.getCommittedTaskPath(tac);
    FileSystem dfs = committedPath.getFileSystem(conf);

    Assert.assertEquals("Should commit to HDFS", getDFS(), dfs);

    FileStatus[] stats = dfs.listStatus(committedPath);
    Assert.assertEquals("Should produce one commit file", 1, stats.length);
    Assert.assertEquals("Should name the commits file with the task ID", "task_job_0001_r_000002",
            stats[0].getPath().getName());

    List<S3Util.PendingUpload> pending = S3Util.readPendingCommits(dfs, stats[0].getPath());
    Assert.assertEquals("Should have one pending commit", 1, pending.size());

    S3Util.PendingUpload commit = pending.get(0);
    Assert.assertEquals("Should write to the correct bucket", BUCKET, commit.getBucketName());
    Assert.assertEquals("Should write to the correct key", KEY_PREFIX + "/" + file.getName(), commit.getKey());

    assertValidUpload(committer.results.getTagsByUpload(), commit);
}
From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java
License:Apache License
private static Set<String> commitTask(S3MultipartOutputCommitter committer, TaskAttemptContext tac,
        int numFiles) throws IOException {
    Path attemptPath = committer.getTaskAttemptPath(tac);

    Set<String> files = Sets.newHashSet();
    for (int i = 0; i < numFiles; i += 1) {
        Path outPath = writeOutputFile(tac.getTaskAttemptID(), attemptPath, UUID.randomUUID().toString(),
                10 * (i + 1));
        files.add(KEY_PREFIX + "/" + outPath.getName() + "-" + committer.getUUID());
    }

    committer.commitTask(tac);

    return files;
}
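Both of these tests lean on the same pattern: the object-store key expected for an uploaded file is a fixed prefix plus the final path component returned by getName(). A small hedged sketch of that key construction, with a hypothetical prefix and file name:

// Hypothetical prefix and task output file.
String keyPrefix = "warehouse/table/output";
Path outPath = new Path("/tmp/attempt_0001/part-r-00002");

// The key keeps only the file name; the local parent directories are dropped.
String key = keyPrefix + "/" + outPath.getName(); // "warehouse/table/output/part-r-00002"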
From source file:com.netflix.bdp.s3mper.listing.BigTableGcsConsistentListingAspectTest.java
License:Apache License
@Test
public void testRenameFolderToPreexistingFolder() throws Throwable {
    Path folder1 = new Path(testPath + "/rename/");
    Path folder2 = new Path(testPath + "/rename2/");
    Path file = new Path(folder1, "file.test");

    assertTrue(deleteFs.mkdirs(folder1));
    assertTrue(deleteFs.mkdirs(folder2));

    validateMetadata(testPath,
            new FileInfo[] { new FileInfo(folder1, false, true), new FileInfo(folder2, false, true) });

    OutputStream fout = deleteFs.create(file);
    assertNotNull(fout);
    fout.close();

    validateMetadata(folder1, new FileInfo(file, false, false));

    deleteFs.rename(folder1, folder2);

    validateMetadata(new Path(testPath + "/rename2"),
            new FileInfo(new Path(testPath + "/rename2/" + folder1.getName()), false, true));
    validateMetadata(new Path(testPath + "/rename2/rename"),
            new FileInfo(new Path(testPath + "/rename2/rename/" + file.getName()), false, false));
}
From source file:com.netflix.bdp.s3mper.listing.BigTableGcsConsistentListingAspectTest.java
License:Apache License
@Test
public void testRenameFolderToNonexistingFolder() throws Throwable {
    Path folder1 = new Path(testPath + "/rename/");
    Path folder2 = new Path(testPath + "/rename2/");
    Path file = new Path(folder1, "file.test");

    assertTrue(deleteFs.mkdirs(folder1));

    validateMetadata(testPath, new FileInfo(folder1, false, true));

    OutputStream fout = deleteFs.create(file);
    assertNotNull(fout);
    fout.close();

    validateMetadata(folder1, new FileInfo(file, false, false));

    deleteFs.rename(folder1, folder2);

    validateMetadata(testPath, new FileInfo(folder2, false, true));
    validateMetadata(folder2, new FileInfo(new Path(folder2, file.getName()), false, false));
}
From source file:com.packetloop.packetpig.storage.JsonMetadata.java
License:Apache License
/**
 * Given a path, which may represent a glob pattern, a directory,
 * comma separated files/glob patterns or a file, this method
 * finds the set of relevant metadata files on the storage system.
 * The algorithm for finding the metadata file is as follows:
 * <p>
 * For each object represented by the path (either directly, or via a glob):
 * If object is a directory, and path/metaname exists, use that as the metadata file.
 * Else if parentPath/metaname exists, use that as the metadata file.
 * <p>
 * Resolving conflicts, merging the metadata, etc, is not handled by this method and should be
 * taken care of by downstream code.
 * <p>
 * @param path Path, as passed in to a LoadFunc (may be a Hadoop glob)
 * @param metaname Metadata file designation, such as .pig_schema or .pig_stats
 * @param conf configuration object
 * @return Set of element descriptors for all metadata files associated with the files on the path.
 */
protected Set<ElementDescriptor> findMetaFile(String path, String metaname, Configuration conf)
        throws IOException {
    Set<ElementDescriptor> metaFileSet = new HashSet<ElementDescriptor>();
    String[] locations = LoadFunc.getPathStrings(path);
    for (String loc : locations) {
        DataStorage storage;
        storage = new HDataStorage(new Path(loc).toUri(), ConfigurationUtil.toProperties(conf));

        String fullPath = FileLocalizer.fullPath(loc, storage);

        if (storage.isContainer(fullPath)) {
            ElementDescriptor metaFilePath = storage.asElement(fullPath, metaname);
            if (exists(metaFilePath)) {
                metaFileSet.add(metaFilePath);
            }
        } else {
            ElementDescriptor[] descriptors = storage.asCollection(loc);
            for (ElementDescriptor descriptor : descriptors) {
                ContainerDescriptor container = null;

                if (descriptor instanceof HFile) {
                    Path descriptorPath = ((HPath) descriptor).getPath();
                    String fileName = descriptorPath.getName();
                    Path parent = descriptorPath.getParent();
                    String parentName = parent.toString();
                    container = new HDirectory((HDataStorage) storage, parent);
                } else { // descriptor instanceof HDirectory
                    container = (HDirectory) descriptor;
                }

                // if no custom schema, try the parent directory
                ElementDescriptor metaFilePath = storage.asElement(container, metaname);
                if (exists(metaFilePath)) {
                    metaFileSet.add(metaFilePath);
                }
            }
        }
    }
    return metaFileSet;
}
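The getName()/getParent() calls in this example support the lookup described in the javadoc: a metadata file is searched for next to each data file, i.e. in the file's parent directory. A plain-Path sketch of that sibling lookup, assuming org.apache.hadoop.fs.Path is imported (file and metadata names are hypothetical):

// Hypothetical data file and metadata file name.
Path dataFile = new Path("/warehouse/events/part-00001.json");
String fileName = dataFile.getName();                               // "part-00001.json"
Path candidateMeta = new Path(dataFile.getParent(), ".pig_schema"); // /warehouse/events/.pig_schema
// The real code above goes through Pig's DataStorage/ElementDescriptor API rather than raw Paths.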