Example usage for org.apache.hadoop.fs FileSystem concat

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem concat.

Prototype

public void concat(final Path trg, final Path[] psrcs) throws IOException 

Document

Concat existing files together.
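
Before the real-world usages below, here is a minimal, self-contained sketch of a concat call. The paths are hypothetical, chosen only for illustration. The base FileSystem class throws UnsupportedOperationException for concat; the call only works on implementations that support it (notably HDFS), where a successful call appends the sources' blocks to the target and removes the source files from the namespace.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ConcatSketch {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // Hypothetical paths: the target must already exist.
        Path target = new Path("/data/part-0");
        Path[] sources = { new Path("/data/part-1"), new Path("/data/part-2") };

        // Appends the sources onto the target in array order; on HDFS the
        // source files disappear from the namespace once the call succeeds.
        fs.concat(target, sources);
    }
}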

Usage

From source file: a.TestConcatExample.java

License: Apache License

@Test
public void concatIsPermissive() throws IOException, URISyntaxException {
    MiniDFSCluster cluster = null;
    final Configuration conf = WebHdfsTestUtil.createConf();
    conf.set("dfs.namenode.fs-limits.min-block-size", "1000"); // Allow tiny blocks for the test
    try {
        cluster = new MiniDFSCluster.Builder(conf).numDataNodes(2).build();
        cluster.waitActive();
        final FileSystem webHdfs = WebHdfsTestUtil.getWebHdfsFileSystem(conf, WebHdfsFileSystem.SCHEME);
        final FileSystem dfs = cluster.getFileSystem();

        final FileSystem fs = dfs; // WebHDFS has a bug in getLocatedBlocks

        Path root = new Path("/dir");
        fs.mkdirs(root);

        short origRep = 3;
        short secondRep = (short) (origRep - 1);
        Path f1 = new Path("/dir/f1");
        long size1 = writeFile(fs, f1, /* blocksize */ 4096, origRep, 5);
        long f1NumBlocks = fs.getFileBlockLocations(f1, 0, size1).length;
        assertEquals(5, f1NumBlocks);

        Path f2 = new Path("/dir/f2");
        long size2 = writeFile(fs, f2, /* blocksize (must be a multiple of 512 for checksums) */ 4096 - 512, secondRep, 4);
        long f2NumBlocks = fs.getFileBlockLocations(f2, 0, size2).length;
        assertEquals(5, f2NumBlocks);

        fs.concat(f1, new Path[] { f2 });
        FileStatus[] fileStatuses = fs.listStatus(root);

        // Only one file should remain
        assertEquals(1, fileStatuses.length);
        FileStatus fileStatus = fileStatuses[0];

        // And it should be named after the first file
        assertEquals("f1", fileStatus.getPath().getName());

        // The entire file takes the replication of the first argument
        assertEquals(origRep, fileStatus.getReplication());

        // As expected, the new concatenated file is the length of both the previous files
        assertEquals(size1 + size2, fileStatus.getLen());

        // And we should have the same number of blocks
        assertEquals(f1NumBlocks + f2NumBlocks,
                fs.getFileBlockLocations(fileStatus.getPath(), 0, size1 + size2).length);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
        }

    }
}
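
What this test demonstrates: concat accepted a source file (f2) whose replication factor and block size differ from the target's (f1); the merged file keeps the first file's replication, its length is the sum of the parts, and its block count is the sum of the parts' block counts, suggesting the blocks are relinked rather than rewritten.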

From source file: org.apache.gobblin.data.management.copy.splitter.DistcpFileSplitter.java

License: Apache License

/**
 * Merges all the splits for a given file.
 * Should be called on the target/destination file system (after blocks have been copied to targetFs).
 * @param fs {@link FileSystem} where file parts exist.
 * @param file {@link CopyableFile} to merge.
 * @param workUnits {@link WorkUnitState}s for all parts of this file.
 * @param parentPath {@link Path} where the parts of the file are located.
 * @return a {@link WorkUnit} equivalent to the distcp work unit if the file had not been split.
 * @throws IOException
 */
private static WorkUnitState mergeSplits(FileSystem fs, CopyableFile file, Collection<WorkUnitState> workUnits,
        Path parentPath) throws IOException {

    log.info(String.format("File %s was written in %d parts. Merging.", file.getDestination(),
            workUnits.size()));
    Path[] parts = new Path[workUnits.size()];
    for (WorkUnitState workUnit : workUnits) {
        if (!isSplitWorkUnit(workUnit)) {
            throw new IOException("Not a split work unit.");
        }
        Split split = getSplit(workUnit).get();
        parts[split.getSplitNumber()] = new Path(parentPath, split.getPartName());
    }

    Path target = new Path(parentPath, file.getDestination().getName());

    fs.rename(parts[0], target);
    fs.concat(target, Arrays.copyOfRange(parts, 1, parts.length));

    WorkUnitState finalWorkUnit = workUnits.iterator().next();
    finalWorkUnit.removeProp(SPLIT_KEY);
    return finalWorkUnit;
}
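
Note the two-step pattern here: the first part is renamed into place to create the target file, and the remaining parts are then concatenated onto it via Arrays.copyOfRange, since concat appends to an existing target rather than creating one.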