org.apache.hadoop.fs.tar.TarFileSystem.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.hadoop.fs.tar.TarFileSystem.java

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.fs.tar;

import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BufferedFSInputStream;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.util.Progressable;

/**
 * Creates a FileSystem out of a TAR file.
 * A tar file is treated as a directory containing files.
 * Sub-directories are not yet supported.
 * Tar files can be specified using following sample URI schema <br/>
 * 
 * tar://hdfs-namenode:port/tarfile.tar (the whole tar.. treated as a directory)<br/>
 * tar://hdfs-namenode:port/tarfile.tar+somefile.txt
 * <br/>
 * <ul>
 *    <li>TODO subdirectories within a tar is not yet supported</li>
 *    <li>TODO tarfiles must not contain '+' anywhere in the path</li>
 *    <li>TODO TAR.GZ files are not yet supported. GZ is not seekable.</li>
 * </ul>
 * @author joydip
 *
 */
public class TarFileSystem extends FileSystem {

    private URI uri;
    //private String      prefix;
    private TarIndex index;
    private FileSystem underlyingFS = null;
    private Path workingDir;

    private static final String TAR_URLPREFIX = "tar://";
    private static final char TAR_INFILESEP = '+';

    public static final Log LOG = LogFactory.getLog(TarFileSystem.class);

    @Override
    public void initialize(URI name, Configuration conf) throws IOException {
        LOG.info("*** Using Tar file system ***");
        super.initialize(name, conf);

        this.underlyingFS = TarFSUtils.getHadoopFS(getBaseTarPath(new Path(name)).toUri(), conf);
        this.index = new TarIndex(underlyingFS, getBaseTarPath(new Path(name)), true, conf);

        initURI(name, conf);
        setConf(conf);
    }

    private void initURI(URI name, Configuration conf) {
        if (name.getAuthority() != null) {
            String uriStr = name.getScheme() + "://" + name.getAuthority();
            this.uri = URI.create(uriStr);
        } else {
            this.uri = URI.create(name.getScheme() + ":///");
        }
    }

    @Override
    public URI getUri() {
        return uri;
    }

    private String getFileInArchive(Path tarPath) {
        String fullUri = tarPath.toUri().toString();
        int i = fullUri.lastIndexOf(TAR_INFILESEP);
        if (i == -1)
            return null;
        return fullUri.substring(i + 1);
    }

    /**
     * Creates an URI for base tar-ball from an URI in tar FS schema
     * @param tarPath
     * @return
     * @throws URISyntaxException 
     */
    private Path getBaseTarPath(Path tarPath) {
        URI uri = tarPath.toUri();

        // form the prefix part   
        String basePrefix = uri.getAuthority();
        if (basePrefix != null)
            basePrefix = basePrefix.replaceFirst("-", "://");
        else
            basePrefix = "";

        // form the path component
        String basePath = uri.getPath();
        // strip the part containing inFile name
        int lastPlusIndex = basePath.lastIndexOf(TAR_INFILESEP);
        if (lastPlusIndex != -1)
            basePath = basePath.substring(0, lastPlusIndex);

        basePath = basePrefix + basePath;

        return new Path(basePath);
    }

    private Path makeAbsolute(Path path) {
        String pathStr = path.toUri().toASCIIString();
        if (pathStr.startsWith(TAR_URLPREFIX))
            return path;
        else
            return new Path(getUri() + path.toUri().toString());
    }

    @Override
    public FSDataInputStream open(Path f, int bufferSize) throws IOException {

        Path baseTarPath = getBaseTarPath(f);
        String inFile = getFileInArchive(f);

        if (inFile == null)
            throw new IOException("TAR FileSystem: Can not open the whole TAR");

        // adjust for the header
        long offset = index.getOffset(inFile);
        long size = index.getSize(inFile);

        FSDataInputStream in = underlyingFS.open(baseTarPath);

        in.seek(offset - 512);
        TarArchiveEntry entry = readHeaderEntry(in);
        if (!entry.getName().equals(inFile)) {
            LOG.fatal("Index file is corrupt." + "Requested filename is present in index " + "but absent in TAR.");
            throw new IOException("Requested filename does not match ");
        }

        return new FSDataInputStream(
                new BufferedFSInputStream(new SeekableTarInputStream(in, size, offset), bufferSize));
    }

    private TarArchiveEntry readHeaderEntry(InputStream is) throws IOException {
        byte[] buffer = new byte[512];
        readHeaderBuffer(is, buffer);
        return new TarArchiveEntry(buffer);
    }

    private TarArchiveEntry readHeaderEntry(InputStream is, byte[] buffer) throws IOException {
        readHeaderBuffer(is, buffer);
        return new TarArchiveEntry(buffer);
    }

    private void readHeaderBuffer(InputStream is, byte[] buffer) throws IOException {
        int bytesRead = is.read(buffer);
        if (bytesRead == -1)
            throw new IOException("EOF Occurred while reading buffer.");
        if (bytesRead < 512)
            throw new IOException("Could not read the full header.");
    }

    @Override
    public FileStatus[] listStatus(Path f) throws IOException {
        ArrayList<FileStatus> ret = new ArrayList<FileStatus>();
        Path abs = makeAbsolute(f);
        Path baseTar = getBaseTarPath(abs);
        String inFile = getFileInArchive(abs);
        FileStatus underlying = underlyingFS.getFileStatus(baseTar);

        // if subfile exists in the path, just return the status of that
        if (inFile != null) {
            ret.add(getFileStatus(abs));
        }

        else {
            FSDataInputStream in = underlyingFS.open(baseTar);
            byte[] buffer = new byte[512];

            for (long offset : index.getOffsetList()) {
                in.seek(offset - 512); // adjust for the header
                TarArchiveEntry entry = readHeaderEntry(in, buffer);

                // Construct a FileStatus object 
                FileStatus fstatus = new FileStatus(entry.getSize(), entry.isDirectory(),
                        (int) underlying.getReplication(), underlying.getBlockSize(), entry.getModTime().getTime(),
                        underlying.getAccessTime(), new FsPermission((short) entry.getMode()), entry.getUserName(),
                        entry.getGroupName(),
                        new Path(abs.toUri().toASCIIString() + TAR_INFILESEP + entry.getName()));
                ret.add(fstatus);
            }
        }

        // copy back
        FileStatus[] retArray = new FileStatus[ret.size()];
        ret.toArray(retArray);
        return retArray;
    }

    @Override
    public void setWorkingDirectory(Path new_dir) {
        this.workingDir = new_dir;

    }

    @Override
    public Path getWorkingDirectory() {
        return this.workingDir;
    }

    @Override
    public FileStatus getFileStatus(Path f) throws IOException {
        FileStatus fstatus = null;
        Path abs = makeAbsolute(f);
        Path baseTar = getBaseTarPath(abs);
        String inFile = getFileInArchive(abs);

        FileStatus underlying = underlyingFS.getFileStatus(baseTar);

        if (inFile == null) {
            // return the status of the tar itself but make it a dir
            fstatus = new FileStatus(underlying.getLen(), true, underlying.getReplication(),
                    underlying.getBlockSize(), underlying.getModificationTime(), underlying.getAccessTime(),
                    underlying.getPermission(), underlying.getOwner(), underlying.getGroup(), abs);
        }

        else {
            long offset = index.getOffset(inFile);

            FSDataInputStream in = underlyingFS.open(baseTar);
            in.seek(offset - 512);
            TarArchiveEntry entry = readHeaderEntry(in);

            if (!entry.getName().equals(inFile)) {
                LOG.fatal("Index file is corrupt." + "Requested filename is present in index "
                        + "but absent in TAR.");
                throw new IOException("NBU-TAR: FATAL: entry file name " + "does not match requested file name");
            }

            // Construct a FileStatus object 
            fstatus = new FileStatus(entry.getSize(), entry.isDirectory(), (int) underlying.getReplication(),
                    underlying.getBlockSize(), entry.getModTime().getTime(), underlying.getAccessTime(),
                    new FsPermission((short) entry.getMode()), entry.getUserName(), entry.getGroupName(), abs);
        }
        return fstatus;
    }

    /*
     * DISABLED METHODS FOR READ-ONLY FILE SYSTEM
     */
    private String notSupportedMsg(String op) throws IOException {
        return op + " is not supported on a Tar File System";
    }

    @Override
    public FSDataOutputStream create(Path f, FsPermission permission, boolean overwrite, int bufferSize,
            short replication, long blockSize, Progressable progress) throws IOException {
        throw new IOException(notSupportedMsg("Create"));
    }

    @Override
    public FSDataOutputStream append(Path f, int bufferSize, Progressable progress) throws IOException {
        throw new IOException(notSupportedMsg("Append"));
    }

    @Override
    public boolean rename(Path src, Path dst) throws IOException {
        throw new IOException(notSupportedMsg("Rename"));
    }

    @Override
    @Deprecated
    public boolean delete(Path f) throws IOException {
        throw new IOException(notSupportedMsg("Delete"));
    }

    @Override
    public boolean delete(Path f, boolean recursive) throws IOException {
        throw new IOException(notSupportedMsg("Delete"));
    }

    @Override
    public boolean mkdirs(Path f, FsPermission permission) throws IOException {
        throw new IOException(notSupportedMsg("Mkdir"));
    }

}