com.google.cloud.genomics.dataflow.functions.verifybamid.ReadFunctions.java Source code

Java tutorial

Introduction

Here is the source code for com.google.cloud.genomics.dataflow.functions.verifybamid.ReadFunctions.java

Source

/*
 * Copyright 2015 Google.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.cloud.genomics.dataflow.functions.verifybamid;

import com.google.cloud.dataflow.sdk.transforms.SerializableFunction;
import com.google.cloud.genomics.dataflow.model.ReadBaseQuality;
import com.google.cloud.genomics.dataflow.model.ReadBaseWithReference;
import com.google.common.collect.ImmutableList;
import com.google.genomics.v1.CigarUnit;
import com.google.genomics.v1.Position;
import com.google.genomics.v1.Read;

import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Utility methods for working with genetic read data.
 */
public class ReadFunctions {

    private static final String UNINITIALIZED_REFERENCE_SEQUENCE = "UNINITIALIZED";

    /**
     * Use the given read to build a list of aligned read and reference base
     * information.
     *
     * @param read The read with the alignment information
     * @return read and reference information.  For a read without an alignment
     *            or cigar units, null is returned.
     */
    public static List<ReadBaseWithReference> extractReadBases(Read read) {

        // Make sure this read has a valid alignment with Cigar Units
        if (!read.hasAlignment() || (read.getAlignment().getCigarCount() == 0)) {
            return null;
        }

        ImmutableList.Builder<ReadBaseWithReference> bases = ImmutableList.builder();

        String readSeq = read.getAlignedSequence();
        List<Integer> readQual = read.getAlignedQualityList();
        String refSeq = UNINITIALIZED_REFERENCE_SEQUENCE;

        int refPosAbsoluteOffset = 0;
        int readOffset = 0;
        for (CigarUnit unit : read.getAlignment().getCigarList()) {
            switch (unit.getOperation()) {
            case ALIGNMENT_MATCH:
            case SEQUENCE_MISMATCH:
            case SEQUENCE_MATCH:
                for (int i = 0; i < unit.getOperationLength(); i++) {
                    String refBase = "";
                    if (unit.getOperation().equals(CigarUnit.Operation.SEQUENCE_MATCH)) {
                        refBase = readSeq.substring(readOffset, readOffset + 1);
                    } else if (!unit.getReferenceSequence().isEmpty()) {
                        // try to get the ref sequence from the Cigar unit
                        refBase = unit.getReferenceSequence().substring(i, i + 1);
                    } else {
                        // try to get the ref sequence by fully parsing the MD tag if not already cached
                        if (refSeq != null && refSeq.equals(UNINITIALIZED_REFERENCE_SEQUENCE)) {
                            refSeq = com.google.cloud.genomics.utils.grpc.ReadUtils
                                    .inferReferenceSequenceByParsingMdFlag(read);
                        }
                        if (refSeq != null) {
                            refBase = refSeq.substring(readOffset, readOffset + 1);
                        }
                    }
                    String name = read.getAlignment().getPosition().getReferenceName();
                    Matcher m = Pattern.compile("^(chr)?(X|Y|([12]?\\d))$").matcher(name);
                    if (m.matches()) {
                        name = m.group(m.groupCount() - 1);
                    }
                    Position refPosition = Position.newBuilder().setReferenceName(name)
                            .setPosition(read.getAlignment().getPosition().getPosition() + refPosAbsoluteOffset)
                            .build();
                    bases.add(new ReadBaseWithReference(
                            new ReadBaseQuality(readSeq.substring(readOffset, readOffset + 1),
                                    readQual.get(readOffset)),
                            refBase, refPosition));
                    refPosAbsoluteOffset++;
                    readOffset++;
                }
                break;

            case PAD: // padding (silent deletion from padded reference)
            case CLIP_HARD: // hard clipping (clipped sequences NOT present in seq)
                break;

            case CLIP_SOFT: // soft clipping (clipped sequences present in SEQ)
            case INSERT: // insertion to the reference
                readOffset += unit.getOperationLength();
                break;

            case DELETE: // deletion from the reference
            case SKIP: // intron (mRNA-to-genome alignment only)
                refPosAbsoluteOffset += unit.getOperationLength();
                break;

            default:
                throw new IllegalArgumentException("Illegal cigar code: " + unit.getOperation());
            }
        }

        return bases.build();
    }

    /**
     * Make sure a read is from a chromosome.
     */
    public static final SerializableFunction<Read, Boolean> IS_ON_CHROMOSOME = new SerializableFunction<Read, Boolean>() {
        @Override
        public Boolean apply(Read r) {
            return Pattern.compile("^(chr)?(X|Y|([12]?\\d))$")
                    .matcher(r.getAlignment().getPosition().getReferenceName()).matches();
        }
    };

    /**
     * Make sure a read passed QC.
     */
    public static final SerializableFunction<Read, Boolean> IS_NOT_QC_FAILURE = new SerializableFunction<Read, Boolean>() {
        @Override
        public Boolean apply(Read r) {
            return !r.getFailedVendorQualityChecks();
        }
    };

    /**
     * Make sure a read is not a duplicate.
     */
    public static final SerializableFunction<Read, Boolean> IS_NOT_DUPLICATE = new SerializableFunction<Read, Boolean>() {
        @Override
        public Boolean apply(Read r) {
            return !r.getDuplicateFragment();
        }
    };

    /**
     * Make sure the read alignment is proper (paired end reads facing each other with acceptable
     * fragment size).
     */
    public static final SerializableFunction<Read, Boolean> IS_PROPER_PLACEMENT = new SerializableFunction<Read, Boolean>() {
        @Override
        public Boolean apply(Read r) {
            return r.getProperPlacement();
        }
    };
}