org.opencb.hpg.bigdata.core.io.VcfBlockIterator.java Source code

Java tutorial

Introduction

Here is the source code for org.opencb.hpg.bigdata.core.io.VcfBlockIterator.java

Source

/*
 * Copyright 2015 OpenCB
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 *
 */
package org.opencb.hpg.bigdata.core.io;

import htsjdk.tribble.readers.LineIterator;
import htsjdk.variant.vcf.VCFHeader;
import htsjdk.variant.vcf.VCFHeaderVersion;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.NotImplementedException;
import org.opencb.biodata.formats.variant.vcf4.FullVcfCodec;
import org.opencb.commons.io.DataReader;

import java.io.*;
import java.nio.CharBuffer;
import java.util.*;
import java.util.concurrent.atomic.AtomicLong;
import java.util.zip.GZIPInputStream;

/**
 * Single-pass reader that hands out the lines of a VCF source in blocks.
 *
 * <p>The header is consumed in the constructor through the supplied codec and is available via
 * {@link #getHeader()} and {@link #getVersion()}. The remaining lines of the underlying
 * {@link LineIterator} are then returned in batches, each line wrapped in a {@link CharBuffer}.
 *
 * <p>The instance is its own {@link Iterator}: {@link #iterator()} returns {@code this}, so it can
 * be traversed only once. No synchronization is performed in this class.
 *
 * @author mh719
 */
public class VcfBlockIterator implements AutoCloseable, Iterator<List<CharBuffer>>, Iterable<List<CharBuffer>> {
    /** Default block size, counted in characters (64 * 1024). */
    private static final long DEFAULT_64KB_BLOCK = 64L * 1024L;

    /** Source file, or {@code null} when the instance was built directly from a stream. */
    private final File file;
    /** Stream the lines are read from; closed by {@link #close()}. */
    private final InputStream in;
    /** Line source created by the codec on top of {@link #in}. */
    private final LineIterator iter;

    /** Character budget used by the no-argument {@link #next()}. */
    private final AtomicLong charBlockSize = new AtomicLong(DEFAULT_64KB_BLOCK);

    private final VCFHeader header;

    private final VCFHeaderVersion version;

    /**
     * Opens {@code vcfFile} using a fresh {@link FullVcfCodec}.
     *
     * @param vcfFile file to read; see {@link #buildInputStream(File)} for the accepted extensions
     * @throws IOException if the file cannot be opened
     */
    public VcfBlockIterator(File vcfFile) throws IOException {
        this(vcfFile, new FullVcfCodec());
    }

    /**
     * Opens {@code vcfFile} and reads its header with the given codec.
     *
     * <p>The stream is opened by this class, so it is closed again if reading the header fails;
     * otherwise nobody would hold a reference that could be used to release it.
     *
     * @param vcfFile file to read; see {@link #buildInputStream(File)} for the accepted extensions
     * @param codec   codec used to create the line source and to parse the header
     * @throws IOException if the file cannot be opened
     */
    public VcfBlockIterator(File vcfFile, FullVcfCodec codec) throws IOException {
        this.file = vcfFile;
        // NOTE(review): buildInputStream is overridable and is called during construction;
        // kept as is because subclasses may rely on it.
        this.in = buildInputStream(this.file);
        boolean initialised = false;
        try {
            this.iter = codec.makeSourceFromStream(this.in);
            this.header = (VCFHeader) codec.readActualHeader(this.iter);
            this.version = codec.getVCFHeaderVersion();
            initialised = true;
        } finally {
            if (!initialised) {
                // Construction is being aborted by an exception: release the stream we opened.
                closeQuietly(this.in);
            }
        }
    }

    /**
     * Reads the header from an already opened stream with the given codec.
     *
     * <p>The stream stays owned by the caller while construction is in progress: it is not closed
     * here if reading the header fails. After successful construction {@link #close()} closes it.
     *
     * @param in    stream positioned at the start of the VCF content
     * @param codec codec used to create the line source and to parse the header
     * @throws IOException declared for compatibility with existing callers
     */
    public VcfBlockIterator(InputStream in, FullVcfCodec codec) throws IOException {
        this.file = null;
        this.in = in;
        this.iter = codec.makeSourceFromStream(this.in);
        this.header = (VCFHeader) codec.readActualHeader(this.iter);
        this.version = codec.getVCFHeaderVersion();
    }

    /**
     * @return the header parsed by the codec during construction
     */
    public VCFHeader getHeader() {
        return this.header;
    }

    /**
     * @return the header version reported by the codec after the header was read
     */
    public VCFHeaderVersion getVersion() {
        return version;
    }

    /**
     * Returns the next block using the default character budget.
     *
     * <p>Unlike the general {@link Iterator} contract, an exhausted source yields an empty list
     * rather than a {@link NoSuchElementException}; check {@link #hasNext()} first.
     *
     * @return the lines of the next block, possibly empty
     */
    @Override
    public List<CharBuffer> next() {
        return next(this.charBlockSize.get());
    }

    /**
     * Collects lines until the accumulated character count reaches {@code blockSize} or the
     * source is exhausted.
     *
     * <p>The budget is checked before each line is read, so the returned block may exceed
     * {@code blockSize} by part of its last line. A {@code blockSize} of zero or less returns an
     * empty list without consuming anything.
     *
     * @param blockSize character budget for the block
     * @return the collected lines, each wrapped in its own {@link CharBuffer}; empty if no lines
     *         are left
     */
    public List<CharBuffer> next(long blockSize) {
        long cnt = 0L;
        // LinkedList is the original author's choice: no backing array to size up front.
        List<CharBuffer> next = new LinkedList<>();
        while (iter.hasNext() && cnt < blockSize) {
            String line = iter.next();
            CharBuffer buff = CharBuffer.wrap(line.toCharArray()); //FIXME! Avoid char array copy
            next.add(buff);
            cnt += buff.length(); // remaining chars of a freshly wrapped array == line length
        }
        return next;
    }

    /**
     * @return {@code true} while the underlying line iterator has more lines
     */
    @Override
    public boolean hasNext() {
        return iter.hasNext();
    }

    /**
     * Opens {@code inFile} and picks the decoding by file extension.
     *
     * <p>{@code gz} and {@code gzip} are wrapped in a {@link GZIPInputStream}; {@code vcf},
     * {@code txt} and {@code tsv} are read as they are. The result is always wrapped in a
     * {@link BufferedInputStream}. The extension match is case-sensitive.
     *
     * <p>If anything fails after the file has been opened (unsupported extension, invalid gzip
     * header), the file is closed again before the exception propagates.
     *
     * @param inFile file to open
     * @return buffered stream over the (possibly decompressed) file content
     * @throws IOException             if the file cannot be opened or the gzip header cannot be read
     * @throws NotImplementedException if the extension is not one of those listed above
     */
    protected InputStream buildInputStream(File inFile) throws IOException {
        InputStream raw = new FileInputStream(inFile);
        boolean handedOver = false;
        try {
            String name = inFile.getName();
            String ext = FilenameUtils.getExtension(name);
            InputStream decoded;
            switch (ext) {
            case "gz":
            case "gzip":
                // The constructor reads the gzip header and throws on malformed input.
                decoded = new GZIPInputStream(raw);
                break;
            case "vcf":
            case "txt":
            case "tsv":
                //nothing to do
                decoded = raw;
                break;
            default:
                throw new NotImplementedException(String.format("Compression extension %s not yet supported!!!", ext));
            }
            InputStream buffered = new BufferedInputStream(decoded);
            handedOver = true;
            return buffered;
        } finally {
            if (!handedOver) {
                // The caller never sees the stream on this path, so it must be released here.
                closeQuietly(raw);
            }
        }
    }

    /**
     * Closes the underlying input stream.
     *
     * @throws IOException if closing the stream fails
     */
    @Override
    public void close() throws IOException {
        this.in.close();
    }

    /**
     * Always throws: removal is not supported.
     *
     * @throws NotImplementedException always
     */
    @Override
    public void remove() {
        throw new NotImplementedException("Remove not implemented");
    }

    /**
     * @return this very instance; blocks consumed through one iterator are gone for all others
     */
    @Override
    public Iterator<List<CharBuffer>> iterator() {
        return this;
    }

    /**
     * Adapts this iterator to a {@link DataReader} of {@link CharBuffer} lines.
     *
     * <p>Here the {@code size} passed to {@code read} is forwarded to {@link #next(long)} and is
     * therefore a character budget, not a number of lines. Closing the reader closes this
     * iterator; an {@link IOException} is rethrown wrapped in a {@link RuntimeException}.
     *
     * @return a reader view backed by this iterator
     */
    public DataReader<CharBuffer> toCharBufferDataReader() {
        return new DataReader<CharBuffer>() {
            @Override
            public List<CharBuffer> read(int size) {
                return (hasNext() ? next(size) : Collections.<CharBuffer>emptyList());
            }

            @Override
            public boolean close() {
                try {
                    VcfBlockIterator.this.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                return true;
            }
        };
    }

    /**
     * Adapts this iterator to a {@link DataReader} of plain lines.
     *
     * <p>Here the {@code size} passed to {@code read} is the maximum number of lines per batch.
     * Lines are taken directly from the shared line iterator, so they are no longer available
     * through {@link #next()}. Closing the reader closes this iterator; an {@link IOException} is
     * rethrown wrapped in a {@link RuntimeException}.
     *
     * @return a reader view backed by this iterator
     */
    public DataReader<CharSequence> toLineDataReader() {
        return new DataReader<CharSequence>() {
            @Override
            public List<CharSequence> read(int size) {
                List<CharSequence> batch = new ArrayList<>(size);
                for (int i = 0; i < size && iter.hasNext(); i++) {
                    batch.add(iter.next());
                }
                return batch;
            }

            @Override
            public boolean close() {
                try {
                    VcfBlockIterator.this.close();
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
                return true;
            }
        };
    }

    /** Closes {@code resource} on an error path without masking the exception already in flight. */
    private static void closeQuietly(Closeable resource) {
        try {
            resource.close();
        } catch (IOException ignored) {
            // Intentionally dropped: the original failure is the one the caller needs to see.
        }
    }
}