Example usage for org.apache.hadoop.fs Path getName

Introduction

This page lists example usages of org.apache.hadoop.fs.Path.getName().

Prototype

public String getName() 

Document

Returns the final component of this path.
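
To illustrate the behavior described above, here is a minimal, self-contained sketch (not drawn from any of the projects listed below; the class name and path are made up for the example):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        Path path = new Path("hdfs://namenode:8020/user/data/input/part-00000.txt");

        // getName() returns only the final component of the path,
        // without parent directories, scheme, or authority.
        System.out.println(path.getName()); // prints: part-00000.txt

        // Combined with getParent(), individual components can be walked.
        System.out.println(path.getParent().getName()); // prints: input
    }
}

The usage examples below show how real projects apply the same call, typically to derive output file names from input paths or to filter files by suffix.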

Usage

From source file:com.nagarro.nteg.utils.HDFSDirectoryFilesDataReader.java

License:Apache License

@Override
protected FileDataBufferedReader getFileDataBufferedReaderForNewFile() throws IOException {

    final Path hdfsPath = new Path(dirPathName);

    Path locatedFilePath = null;
    final RemoteIterator<LocatedFileStatus> locatedFileIterator = hdfs.listFiles(hdfsPath, true);
    while (locatedFileIterator != null && locatedFileIterator.hasNext()) {
        final LocatedFileStatus locatedFileStatus = locatedFileIterator.next();

        final Path tmpPath = locatedFileStatus.getPath();

        final String pathName = tmpPath.getName();

        if (LOG.isInfoEnabled()) {
            LOG.info("Checking file with name[Log]: " + pathName);
        }

        if (!(pathName.endsWith(FileDataBufferedReader.IN_PROGRESS_FILE_SUFFIX)
                || pathName.endsWith(FileDataBufferedReader.PROCESSED_FILE_SUFFIX))) {
            locatedFilePath = tmpPath;
            break;
        }
    }

    FileDataBufferedReader fileDataBufferedReader = null;
    if (locatedFilePath != null) {
        fileDataBufferedReader = new HDFSFileDataBufferedReader(locatedFilePath, batchSize);
    }

    return fileDataBufferedReader;
}

From source file:com.netease.news.utils.SplitInput.java

License:Apache License

/**
 * Perform a split on the specified input file. Results will be written to files of the same name in the specified
 * training and test output directories. The {@link #validate()} method is called prior to executing the split.
 */
public void splitFile(Path inputFile) throws IOException {
    Configuration conf = getConf();
    FileSystem fs = inputFile.getFileSystem(conf);
    if (fs.getFileStatus(inputFile) == null) {
        throw new IOException(inputFile + " does not exist");
    }
    if (fs.getFileStatus(inputFile).isDir()) {
        throw new IOException(inputFile + " is a directory");
    }

    validate();

    Path testOutputFile = new Path(testOutputDirectory, inputFile.getName());
    Path trainingOutputFile = new Path(trainingOutputDirectory, inputFile.getName());

    int lineCount = countLines(fs, inputFile, charset);

    log.info("{} has {} lines", inputFile.getName(), lineCount);

    int testSplitStart = 0;
    int testSplitSize = this.testSplitSize; // don't modify state
    BitSet randomSel = null;

    if (testRandomSelectionPct > 0 || testRandomSelectionSize > 0) {
        testSplitSize = this.testRandomSelectionSize;

        if (testRandomSelectionPct > 0) {
            testSplitSize = Math.round(lineCount * testRandomSelectionPct / 100.0f);
        }
        log.info("{} test split size is {} based on random selection percentage {}", inputFile.getName(),
                testSplitSize, testRandomSelectionPct);
        long[] ridx = new long[testSplitSize];
        RandomSampler.sample(testSplitSize, lineCount - 1, testSplitSize, 0, ridx, 0, RandomUtils.getRandom());
        randomSel = new BitSet(lineCount);
        for (long idx : ridx) {
            randomSel.set((int) idx + 1);
        }
    } else {
        if (testSplitPct > 0) { // calculate split size based on percentage
            testSplitSize = Math.round(lineCount * testSplitPct / 100.0f);
            log.info("{} test split size is {} based on percentage {}", inputFile.getName(), testSplitSize,
                    testSplitPct);
        } else {
            log.info("{} test split size is {}", inputFile.getName(), testSplitSize);
        }

        if (splitLocation > 0) { // calculate start of split based on percentage
            testSplitStart = Math.round(lineCount * splitLocation / 100.0f);
            if (lineCount - testSplitStart < testSplitSize) {
                // adjust split start downwards based on split size.
                testSplitStart = lineCount - testSplitSize;
            }
            log.info("{} test split start is {} based on split location {}", inputFile.getName(),
                    testSplitStart, splitLocation);
        }

        if (testSplitStart < 0) {
            throw new IllegalArgumentException(
                    "test split size for " + inputFile + " is too large, it would produce an "
                            + "empty training set from the initial set of " + lineCount + " examples");
        } else if (lineCount - testSplitSize < testSplitSize) {
            log.warn(
                    "Test set size for {} may be too large, {} is larger than the number of "
                            + "lines remaining in the training set: {}",
                    inputFile, testSplitSize, lineCount - testSplitSize);
        }
    }
    int trainCount = 0;
    int testCount = 0;
    if (!useSequence) {
        BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(inputFile), charset));
        Writer trainingWriter = new OutputStreamWriter(fs.create(trainingOutputFile), charset);
        Writer testWriter = new OutputStreamWriter(fs.create(testOutputFile), charset);

        try {

            String line;
            int pos = 0;
            while ((line = reader.readLine()) != null) {
                pos++;

                Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                writer.write(line);
                writer.write('\n');
            }

        } finally {
            Closeables.close(reader, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    } else {
        SequenceFileIterator<Writable, Writable> iterator = new SequenceFileIterator<Writable, Writable>(
                inputFile, false, fs.getConf());
        SequenceFile.Writer trainingWriter = SequenceFile.createWriter(fs, fs.getConf(), trainingOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        SequenceFile.Writer testWriter = SequenceFile.createWriter(fs, fs.getConf(), testOutputFile,
                iterator.getKeyClass(), iterator.getValueClass());
        try {

            int pos = 0;
            while (iterator.hasNext()) {
                pos++;
                SequenceFile.Writer writer;
                if (testRandomSelectionPct > 0) { // Randomly choose
                    writer = randomSel.get(pos) ? testWriter : trainingWriter;
                } else { // Choose based on location
                    writer = pos > testSplitStart ? testWriter : trainingWriter;
                }

                if (writer == testWriter) {
                    if (testCount >= testSplitSize) {
                        writer = trainingWriter;
                    } else {
                        testCount++;
                    }
                }
                if (writer == trainingWriter) {
                    trainCount++;
                }
                Pair<Writable, Writable> pair = iterator.next();
                writer.append(pair.getFirst(), pair.getSecond());
            }

        } finally {
            Closeables.close(iterator, true);
            Closeables.close(trainingWriter, false);
            Closeables.close(testWriter, false);
        }
    }
    log.info("file: {}, input: {} train: {}, test: {} starting at {}", inputFile.getName(), lineCount,
            trainCount, testCount, testSplitStart);

    // testing;
    if (callback != null) {
        callback.splitComplete(inputFile, lineCount, trainCount, testCount, testSplitStart);
    }
}

From source file:com.netflix.aegisthus.input.AegisthusInputFormat.java

License:Apache License

/**
 * The main thing that addSSTableSplit handles is splitting SSTables
 * using their index if available. The general algorithm is that if the file
 * is larger than the block size plus some fuzzy factor, it is broken into
 * multiple splits at offsets read from the index file.
 */
public void addSSTableSplit(List<InputSplit> splits, JobContext job, FileStatus file) throws IOException {
    Path path = file.getPath();
    FileSystem fs = path.getFileSystem(job.getConfiguration());
    long length = file.getLen();
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if (length != 0) {
        long blockSize = file.getBlockSize();
        long maxSplitSize = (long) (blockSize * .99);
        long fuzzySplit = (long) (blockSize * 1.2);

        long bytesRemaining = length;

        Iterator<Long> scanner = null;
        Path compressionPath = new Path(path.getParent(),
                path.getName().replaceAll("-Data.db", "-CompressionInfo.db"));
        if (!fs.exists(compressionPath)) {
            // Only initialize if we are going to have more than a single
            // split
            if (fuzzySplit < length) {
                Path indexPath = new Path(path.getParent(), path.getName().replaceAll("-Data.db", "-Index.db"));
                if (!fs.exists(indexPath)) {
                    fuzzySplit = length;
                } else {
                    FSDataInputStream fileIn = fs.open(indexPath);
                    scanner = new OffsetScanner(new DataInputStream(new BufferedInputStream(fileIn)),
                            indexPath.getName());
                }
            }
            long splitStart = 0;
            while (splitStart + fuzzySplit < length && scanner.hasNext()) {
                long splitSize = 0;
                // The scanner returns an offset from the start of the file.
                while (splitSize < maxSplitSize && scanner.hasNext()) {
                    splitSize = scanner.next() - splitStart;
                }
                int blkIndex = getBlockIndex(blkLocations, splitStart + (splitSize / 2));
                LOG.info("split path: " + path.getName() + ":" + splitStart + ":" + splitSize);
                splits.add(new AegSplit(path, splitStart, splitSize, blkLocations[blkIndex].getHosts(),
                        convertors));
                bytesRemaining -= splitSize;
                splitStart += splitSize;
            }
        }

        if (bytesRemaining != 0) {
            LOG.info("end path: " + path.getName() + ":" + (length - bytesRemaining) + ":" + bytesRemaining);
            splits.add(new AegSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts(), convertors, fs.exists(compressionPath),
                    compressionPath));
        }
    } else {
        LOG.info("skipping zero length file: " + path.toString());
    }
}
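
A note on the split sizes used above: assuming the common HDFS block size of 128 MB (an assumption for illustration, not something this code fixes), maxSplitSize works out to about 126.7 MB (128 MB * 0.99) and fuzzySplit to about 153.6 MB (128 MB * 1.2), so a -Data.db file must exceed roughly 1.2 block sizes before the corresponding -Index.db offsets are consulted to break it into multiple splits.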

From source file:com.netflix.aegisthus.tools.StorageHelper.java

License:Apache License

public void copyToTemp(String file, String prefix, boolean snappy) throws IOException {
    String target = getBaseTaskAttemptTempLocation();
    Path targetPath = new Path(target, prefix);
    Path filePath = new Path(file);
    Path fullPath = new Path(targetPath, filePath.getName());

    String log = String.format("copying %s to %s", file, fullPath.toUri().toString());
    LOG.info(log);
    ctx.setStatus(log);
    Utils.copy(new Path(file), fullPath, snappy, ctx);
}

From source file:com.netflix.aegisthus.tools.StorageHelper.java

License:Apache License

public void copyToTemp(String file, boolean snappy) throws IOException {
    String target = getBaseTaskAttemptTempLocation();
    Path targetPath = new Path(target);
    Path filePath = new Path(file);
    Path fullPath = new Path(targetPath, filePath.getName());
    String log = String.format("copying %s to %s", file, fullPath.toUri().toString());
    LOG.info(log);
    ctx.setStatus(log);
    Utils.copy(filePath, fullPath, snappy, ctx);
}

From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java

License:Apache License

@Test
public void testSingleTaskCommit() throws Exception {
    Path file = new Path(commitTask(committer, tac, 1).iterator().next());

    List<String> uploads = committer.results.getUploads();
    Assert.assertEquals("Should initiate one upload", 1, uploads.size());

    Path committedPath = committer.getCommittedTaskPath(tac);
    FileSystem dfs = committedPath.getFileSystem(conf);

    Assert.assertEquals("Should commit to HDFS", getDFS(), dfs);

    FileStatus[] stats = dfs.listStatus(committedPath);
    Assert.assertEquals("Should produce one commit file", 1, stats.length);
    Assert.assertEquals("Should name the commits file with the task ID", "task_job_0001_r_000002",
            stats[0].getPath().getName());

    List<S3Util.PendingUpload> pending = S3Util.readPendingCommits(dfs, stats[0].getPath());
    Assert.assertEquals("Should have one pending commit", 1, pending.size());

    S3Util.PendingUpload commit = pending.get(0);
    Assert.assertEquals("Should write to the correct bucket", BUCKET, commit.getBucketName());
    Assert.assertEquals("Should write to the correct key", KEY_PREFIX + "/" + file.getName(), commit.getKey());

    assertValidUpload(committer.results.getTagsByUpload(), commit);
}

From source file:com.netflix.bdp.s3.TestS3MultipartOutputCommitter.java

License:Apache License

private static Set<String> commitTask(S3MultipartOutputCommitter committer, TaskAttemptContext tac,
        int numFiles) throws IOException {
    Path attemptPath = committer.getTaskAttemptPath(tac);

    Set<String> files = Sets.newHashSet();
    for (int i = 0; i < numFiles; i += 1) {
        Path outPath = writeOutputFile(tac.getTaskAttemptID(), attemptPath, UUID.randomUUID().toString(),
                10 * (i + 1));
        files.add(KEY_PREFIX + "/" + outPath.getName() + "-" + committer.getUUID());
    }

    committer.commitTask(tac);

    return files;
}

From source file:com.netflix.bdp.s3mper.listing.BigTableGcsConsistentListingAspectTest.java

License:Apache License

@Test
public void testRenameFolderToPreexistingFolder() throws Throwable {
    Path folder1 = new Path(testPath + "/rename/");
    Path folder2 = new Path(testPath + "/rename2/");
    Path file = new Path(folder1, "file.test");

    assertTrue(deleteFs.mkdirs(folder1));
    assertTrue(deleteFs.mkdirs(folder2));

    validateMetadata(testPath,
            new FileInfo[] { new FileInfo(folder1, false, true), new FileInfo(folder2, false, true) });

    OutputStream fout = deleteFs.create(file);
    assertNotNull(fout);
    fout.close();

    validateMetadata(folder1, new FileInfo(file, false, false));

    deleteFs.rename(folder1, folder2);

    validateMetadata(new Path(testPath + "/rename2"),
            new FileInfo(new Path(testPath + "/rename2/" + folder1.getName()), false, true));
    validateMetadata(new Path(testPath + "/rename2/rename"),
            new FileInfo(new Path(testPath + "/rename2/rename/" + file.getName()), false, false));
}

From source file:com.netflix.bdp.s3mper.listing.BigTableGcsConsistentListingAspectTest.java

License:Apache License

@Test
public void testRenameFolderToNonexistingFolder() throws Throwable {
    Path folder1 = new Path(testPath + "/rename/");
    Path folder2 = new Path(testPath + "/rename2/");
    Path file = new Path(folder1, "file.test");

    assertTrue(deleteFs.mkdirs(folder1));

    validateMetadata(testPath, new FileInfo(folder1, false, true));

    OutputStream fout = deleteFs.create(file);
    assertNotNull(fout);
    fout.close();

    validateMetadata(folder1, new FileInfo(file, false, false));

    deleteFs.rename(folder1, folder2);

    validateMetadata(testPath, new FileInfo(folder2, false, true));
    validateMetadata(folder2, new FileInfo(new Path(folder2, file.getName()), false, false));
}

From source file:com.packetloop.packetpig.storage.JsonMetadata.java

License:Apache License

/**
 * Given a path, which may represent a glob pattern, a directory,
 * comma separated files/glob patterns or a file, this method
 * finds the set of relevant metadata files on the storage system.
 * The algorithm for finding the metadata file is as follows:
 * <p>
 * For each object represented by the path (either directly, or via a glob):
 *   If object is a directory, and path/metaname exists, use that as the metadata file.
 *   Else if parentPath/metaname exists, use that as the metadata file.
 * <p>
 * Resolving conflicts, merging the metadata, etc, is not handled by this method and should be
 * taken care of by downstream code.
 * <p>
 * @param path      Path, as passed in to a LoadFunc (may be a Hadoop glob)
 * @param metaname    Metadata file designation, such as .pig_schema or .pig_stats
 * @param conf      configuration object
 * @return Set of element descriptors for all metadata files associated with the files on the path.
 */
protected Set<ElementDescriptor> findMetaFile(String path, String metaname, Configuration conf)
        throws IOException {
    Set<ElementDescriptor> metaFileSet = new HashSet<ElementDescriptor>();
    String[] locations = LoadFunc.getPathStrings(path);
    for (String loc : locations) {
        DataStorage storage;

        storage = new HDataStorage(new Path(loc).toUri(), ConfigurationUtil.toProperties(conf));

        String fullPath = FileLocalizer.fullPath(loc, storage);

        if (storage.isContainer(fullPath)) {
            ElementDescriptor metaFilePath = storage.asElement(fullPath, metaname);
            if (exists(metaFilePath)) {
                metaFileSet.add(metaFilePath);
            }
        } else {
            ElementDescriptor[] descriptors = storage.asCollection(loc);
            for (ElementDescriptor descriptor : descriptors) {
                ContainerDescriptor container = null;

                if (descriptor instanceof HFile) {
                    Path descriptorPath = ((HPath) descriptor).getPath();
                    String fileName = descriptorPath.getName();
                    Path parent = descriptorPath.getParent();
                    String parentName = parent.toString();
                    container = new HDirectory((HDataStorage) storage, parent);
                } else { // descriptor instanceof HDirectory
                    container = (HDirectory) descriptor;
                }

                // if no custom schema, try the parent directory
                ElementDescriptor metaFilePath = storage.asElement(container, metaname);
                if (exists(metaFilePath)) {
                    metaFileSet.add(metaFilePath);
                }
            }
        }
    }
    return metaFileSet;
}