org.icgc.dcc.submission.validation.first.io.FPVFileSystem.java Source code

Java tutorial

Introduction

Here is the source code for org.icgc.dcc.submission.validation.first.io.FPVFileSystem.java

Source

/*
 * Copyright (c) 2014 The Ontario Institute for Cancer Research. All rights reserved.                             
 *                                                                                                               
 * This program and the accompanying materials are made available under the terms of the GNU Public License v3.0.
 * You should have received a copy of the GNU General Public License along with                                  
 * this program. If not, see <http://www.gnu.org/licenses/>.                                                     
 *                                                                                                               
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY                           
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES                          
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT                           
 * SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,                                
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED                          
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;                               
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER                              
 * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN                         
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.icgc.dcc.submission.validation.first.io;

import static com.google.common.collect.ImmutableList.copyOf;
import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.regex.Pattern.compile;
import static org.icgc.dcc.submission.validation.platform.SubmissionPlatformStrategy.FIELD_SPLITTER;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.List;
import java.util.zip.GZIPInputStream;

import lombok.Cleanup;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;

import org.apache.hadoop.io.compress.BZip2Codec;
import org.apache.hadoop.io.compress.CompressionInputStream;
import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.icgc.dcc.submission.fs.SubmissionDirectory;

/**
 * Class representing interactions with the file system in the context of FPV (as a temporary measure to isolate such
 * operations from the FPV at first).
 * <p>
 * Every operation is carried out against the {@link SubmissionDirectory} supplied at construction time.
 * <p>
 * TODO: add test for this class (especially after merging {@link Util} in it)
 */
@RequiredArgsConstructor
public class FPVFileSystem {

    /**
     * Compression schemes this class distinguishes; anything that is not detected as gzip or bzip2 is reported as
     * {@link #PLAIN_TEXT}.
     */
    public enum CodecType {
        GZIP, BZIP2, PLAIN_TEXT;
    }

    /** Size (in bytes) of the scratch buffer used when reading a compressed file through. */
    private static final int BUFFER_SIZE = 65536;

    /** Media type string that is mapped to {@link CodecType#GZIP}. */
    private static final String GZIP_MEDIA_TYPE = "application/x-gzip";

    /** Media type string that is mapped to {@link CodecType#BZIP2}. */
    private static final String BZIP2_MEDIA_TYPE = "application/x-bzip2";

    private final SubmissionDirectory submissionDirectory;

    /**
     * Delegates to {@link SubmissionDirectory#getDecompressingInputStream(String)}.
     *
     * @param fileName name of the file to open
     * @return the stream returned by the submission directory
     */
    public InputStream getDecompressingInputStream(String fileName) {
        return submissionDirectory.getDecompressingInputStream(fileName);
    }

    /**
     * Delegates to {@link SubmissionDirectory#listFiles(Iterable)}.
     *
     * @param filePatterns patterns handed over as-is to the submission directory
     * @return whatever the submission directory lists for those patterns
     */
    public Iterable<String> listMatchingSubmissionFiles(Iterable<String> filePatterns) {
        return submissionDirectory.listFiles(filePatterns);
    }

    /**
     * Compiles {@code pattern} as a regular expression and returns an immutable copy of the file names the submission
     * directory lists for it.
     *
     * @param pattern regular expression source
     * @return immutable list of the listed file names
     */
    public List<String> getMatchingFileNames(String pattern) {
        return copyOf(submissionDirectory.listFile(compile(pattern)));
    }

    /**
     * Guesses the codec from the file name alone (the file is not opened), using Tika's name-based detection.
     *
     * @param fileName name of the file
     * @return the codec matching the detected media type, {@link CodecType#PLAIN_TEXT} when neither gzip nor bzip2
     */
    public CodecType determineCodecFromFilename(String fileName) {
        Tika tika = new Tika();
        return toCodecType(tika.detect(fileName));
    }

    /**
     * Guesses the codec by letting Tika's detector inspect the file content; the file name is passed along as a hint
     * through the metadata.
     *
     * @param fileName name of the file to open in the submission directory
     * @return the codec matching the detected media type, {@link CodecType#PLAIN_TEXT} when neither gzip nor bzip2
     * @throws IOException if the file cannot be read
     */
    public CodecType determineCodecFromContent(String fileName) throws IOException {
        @Cleanup
        BufferedInputStream bis = new BufferedInputStream(submissionDirectory.open(fileName));
        AutoDetectParser parser = new AutoDetectParser();
        Detector detector = parser.getDetector();
        Metadata md = new Metadata();
        md.add(Metadata.RESOURCE_NAME_KEY, fileName);

        String mediaType = detector.detect(bis, md).toString(); // FIXME: shouldn't rely on toString()...
        return toCodecType(mediaType);
    }

    /**
     * Opens the file as gzip (which checks the gzip header) and reads it through to the end.
     *
     * @param fileName name of the file to open in the submission directory
     * @throws IOException if the header is rejected or the content cannot be read through
     */
    public void attemptGzipRead(String fileName) throws IOException {
        // Guard the raw stream separately: if the GZIPInputStream constructor rejects the header, the wrapper is never
        // assigned and could not close the underlying stream on its own.
        @Cleanup
        InputStream raw = submissionDirectory.open(fileName);

        // check the gzip header
        @Cleanup
        GZIPInputStream in = new GZIPInputStream(raw);

        // see if it can be read through
        readThrough(in);
    }

    /**
     * Opens the file through Hadoop's {@link BZip2Codec} and reads it through to the end.
     *
     * @param fileName name of the file to open in the submission directory
     * @throws IOException if the stream cannot be created or the content cannot be read through
     */
    public void attemptBzip2Read(String fileName) throws IOException {
        // Guard the raw stream separately so it is closed even when the codec fails to create its wrapper.
        @Cleanup
        InputStream raw = submissionDirectory.open(fileName);

        // check the bzip2 header
        BZip2Codec codec = new BZip2Codec();

        @Cleanup
        CompressionInputStream in = codec.createInputStream(raw);

        // see if it can be read through
        readThrough(in);
    }

    /**
     * Reads the first line of the (decompressed) file and splits it with {@code FIELD_SPLITTER}; a file without any
     * line is treated as having an empty header line.
     * <p>
     * Files are expected to be present and uncorrupted at this stage.
     *
     * @param fileName name of the file to peek into
     * @return immutable list of the fields obtained by splitting the first line
     */
    @SneakyThrows
    public List<String> peekFileHeader(String fileName) {
        // Decode with an explicit charset rather than the JVM default so the result does not depend on the host.
        // NOTE(review): assumes submission files are UTF-8 encoded - confirm.
        @Cleanup
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(submissionDirectory.getDecompressingInputStream(fileName), UTF_8));
        String header = reader.readLine();
        if (header == null) {
            header = "";
        }
        return copyOf(FIELD_SPLITTER.split(header));
    }

    /** Maps a media type string to the corresponding codec, defaulting to {@link CodecType#PLAIN_TEXT}. */
    private static CodecType toCodecType(String mediaType) {
        if (GZIP_MEDIA_TYPE.equals(mediaType)) {
            return CodecType.GZIP;
        }
        if (BZIP2_MEDIA_TYPE.equals(mediaType)) {
            return CodecType.BZIP2;
        }
        return CodecType.PLAIN_TEXT;
    }

    /** Consumes {@code in} until end of stream, discarding the data; the caller remains responsible for closing it. */
    private static void readThrough(InputStream in) throws IOException {
        byte[] buf = new byte[BUFFER_SIZE];
        // -1 is the end-of-stream marker; stopping on 0 instead would report a file as readable without it having
        // been read through.
        while (in.read(buf) != -1) {
            // nothing to do: only interested in whether reading succeeds
        }
    }

}