ubic.gemma.core.loader.expression.arrayDesign.AffyChipTypeExtractor.java Source code

Introduction

Here is the source code for ubic.gemma.core.loader.expression.arrayDesign.AffyChipTypeExtractor.java
Source

/*
 * The Gemma project
 *
 * Copyright (c) 2011 University of British Columbia
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations under the License.
 */
package ubic.gemma.core.loader.expression.arrayDesign;

import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import ubic.basecode.util.FileTools;
import ubic.gemma.core.analysis.preprocess.batcheffects.BatchInfoParser;
import ubic.gemma.core.loader.expression.AffyPowerToolsProbesetSummarize;
import ubic.gemma.model.common.description.LocalFile;
import ubic.gemma.model.expression.bioAssay.BioAssay;
import ubic.gemma.model.expression.experiment.ExpressionExperiment;

import java.io.*;
import java.nio.*;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extract the chip type from Affymetrix CEL files. Handles both version 3 (ASCII) and 4 (binary) files.
 * <p>
 * See <a href='http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/cel.html'>CEL</a> and
 * <a href='http://www.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/generic.html'>GENERIC</a>
 * </p>
 * Note that the Affymetrix documentation does not mention a chip type, explicitly, but it's in the "DatHeader", and in
 * GCOS files as affymetrix-array-type
 *
 * @author paul
 */
public class AffyChipTypeExtractor {

    private static final Log log = LogFactory.getLog(AffyChipTypeExtractor.class);
    private static final String STANDARD_FORMAT_REGEX = ".+?\\s([\\w\\-_]+)\\.1sq.+";

    /**
     * Extract a string like "Rat230_2" from CEL files. This is to help match BioAssays to platforms when reanalyzing
     * from CEL, especially if the original platform information has been lost (e.g., by switching to a merged platform,
     * or from a custom CDF-based version)
     * 
     * @param ee experiment
     * @param files CEL files
     * @return map of bioassays to cel identifiers
     */
    public static Map<BioAssay, String> getChipTypes(ExpressionExperiment ee, Collection<LocalFile> files) {

        Map<String, BioAssay> assayAccessions = BatchInfoParser.getAccessionToBioAssayMap(ee);

        if (assayAccessions.isEmpty()) {
            throw new UnsupportedOperationException(
                    "Couldn't get any accessions for bioassays of: " + ee.getShortName());
        }

        Map<BioAssay, File> bioAssays2Files = matchBioAssaysToRawDataFiles(files, assayAccessions);

        /*
         * Check if we should go on
         */
        if (bioAssays2Files.size() < assayAccessions.size()) {

            if (bioAssays2Files.size() > 0) {
                /*
                 * Missing a few for some reason.
                 */
                for (BioAssay ba : bioAssays2Files.keySet()) {
                    if (!assayAccessions.containsKey(ba.getAccession().getAccession())) {
                        log.warn("Missing raw data file for " + ba + " on " + ee.getShortName());
                    }
                }
            }

            throw new IllegalStateException("Did not get enough raw files :got " + bioAssays2Files.size()
                    + ", expected " + assayAccessions.size() + " while processing " + ee.getShortName());
        }

        try {
            return getChipTypesFromFiles(bioAssays2Files);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * @param bioAssays2Files map to files
     * @return map of BAs to chip names
     */
    private static Map<BioAssay, String> getChipTypesFromFiles(Map<BioAssay, File> bioAssays2Files)
            throws IOException {
        Map<BioAssay, String> result = new HashMap<>();
        for (BioAssay ba : bioAssays2Files.keySet()) {
            File f = bioAssays2Files.get(ba);
            try (InputStream is = FileTools.getInputStreamFromPlainOrCompressedFile(f.getAbsolutePath())) {

                String chiptype = extract(is);

                if (chiptype == null) {
                    log.warn("No chip name type found in " + f.getName() + " for " + ba);
                    continue;
                }

                result.put(ba, chiptype);

            }
        }

        return result;
    }

    /**
     * @param files files
     * @param assayAccessions accessions
     * @return mapt od BAs to files
     */
    private static Map<BioAssay, File> matchBioAssaysToRawDataFiles(Collection<LocalFile> files,
            Map<String, BioAssay> assayAccessions) {

        Map<BioAssay, File> result = new HashMap<>();
        for (LocalFile lf : files) {

            BioAssay bioAssay = AffyPowerToolsProbesetSummarize.matchBioAssayToCelFileName(assayAccessions,
                    lf.asFile().getName());
            if (bioAssay == null) {
                continue;
            }
            log.info("BioAssay found for " + lf.asFile().getName() + " (" + bioAssay + ")");
            result.put(bioAssay, lf.asFile());
        }
        return result;

    }

    /**
     * @param is input stream
     * @return text
     */
    public static String extract(InputStream is) {

        String chipType = null;
        try (DataInputStream str = new DataInputStream(is)) {

            BufferedReader reader;

            int magic = readByte(str);
            switch (magic) {
            case 64: {

                int version = readIntLittleEndian(str);

                if (version != 4) {
                    // it's always supposed to be.
                    throw new IllegalStateException("Affymetrix CEL format not recognized: " + version);
                }

                AffyChipTypeExtractor.log.debug(readShort(str)); // numrows

                AffyChipTypeExtractor.log.debug(readShort(str)); // numcols

                AffyChipTypeExtractor.log.debug(readIntLittleEndian(str)); // numcells

                int headerLen = readShort(str);

                if (headerLen == 0) {
                    // throw new IllegalStateException( "Zero header length read" );
                    headerLen = 800;
                }

                AffyChipTypeExtractor.log.debug(headerLen);

                StringBuilder buf = new StringBuilder();

                for (int i = 0; i < headerLen; i++) {
                    buf.append(new String(new byte[] { str.readByte() }, "US-ASCII"));
                }

                String[] headerLines = StringUtils.split(buf.toString(), "\n");

                for (String string : headerLines) {
                    if (string.startsWith("DatHeader")) {
                        chipType = parseStandardFormat(string);
                        break;
                    }
                }
                break;
            }
            case 59: {

                // Command Console format
                int version = readUnsignedByte(str);
                if (version != 1) {
                    // this is fixed to 1 according to affy docs.
                    throw new IllegalStateException("Affymetrix CEL format not recognized: " + version);
                }
                @SuppressWarnings("unused")
                int numDataGroups = readIntBigEndian(str); // number of data groups, usually = 1. Each data group

                // contains another header, with different name/value/type
                // triples.
                @SuppressWarnings("unused")
                int filePosOfFirstGroup = readIntBigEndian(str); // file position of first data group.

                chipType = parseGenericCCHeader(str);

                // need to find number of parent file headers and then read array of generic file headers

                if (chipType == null) {
                    /*
                     * Look in the parentheader
                     */

                    chipType = parseGenericCCHeader(str);
                }

                // $dataHeader$parents[[1]]$parameters$`affymetrix-scan-date`

                // also $dataHeader$parents[[1]]$parameters$`affymetrix-Fluidics-HybDate`

                break;
            }
            default:

                /*
                 * assume version 3 plain text.
                 */
                reader = new BufferedReader(new InputStreamReader(is));
                String line;
                int count = 0;
                while ((line = reader.readLine()) != null) {
                    // log.info( line );
                    if (line.startsWith("DatHeader")) {
                        chipType = parseStandardFormat(line);
                    }
                    if (chipType != null || ++count > 100) { // don't read too much.
                        reader.close();
                        break;
                    }
                }
                break;
            }

            return chipType;

        } catch (IOException e) {
            throw new RuntimeException(e);
        }

    }

    private static String parseGenericCCHeader(DataInputStream str) throws IOException {

        /*
         * acquisition data, intensity data etc. Usually "intensity data" for the first header.
         */
        String datatypeIdentifier = readString(str);

        AffyChipTypeExtractor.log.debug(datatypeIdentifier);

        String guid = readString(str);

        AffyChipTypeExtractor.log.debug(guid);

        // we just need to read thsee off, even if we aren't using it.
        @SuppressWarnings("unused")
        String createDate = readUnicodeString(str); // blank?
        @SuppressWarnings("unused")
        String locale = readUnicodeString(str); // e.g. en-US
        int numKeyValuePairs = readIntBigEndian(str); // e.g. 55
        String result = null;
        for (int i = 0; i < numKeyValuePairs; i++) {
            String name = readUnicodeString(str);
            byte[] value = readBytes(str);
            String type = readUnicodeString(str);
            Object v;

            switch (type) {
            case "text/x-calvin-float": {
                FloatBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asFloatBuffer();
                float[] array = new float[intBuf.remaining()];
                intBuf.get(array);
                break;
            }
            case "text/plain":
            case "text/ascii":
                // text/ascii is undocumented, but needed.
                v = new String(value, "US-ASCII");
                String vv = new String(((String) v).getBytes(), "UTF-16").trim();

                if (name.equals("affymetrix-array-type")) {
                    return vv;
                }

                break;
            case "text-x-calvin-unsigned-integer-8": {
                ShortBuffer intBuf = ByteBuffer.wrap(value).asShortBuffer();
                short[] array = new short[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-integer-16": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer(); // wrong?

                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);
                break;
            }
            case "text/x-calvin-integer-32": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();
                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-8": {
                ShortBuffer intBuf = ByteBuffer.wrap(value).asShortBuffer();
                short[] array = new short[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-16": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();// wrong?

                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            case "text/x-calvin-unsigned-integer-32": {
                IntBuffer intBuf = ByteBuffer.wrap(value).order(ByteOrder.BIG_ENDIAN).asIntBuffer();
                int[] array = new int[intBuf.remaining()];
                intBuf.get(array);

                break;
            }
            default:
                throw new IOException("Unknown mime type:" + type);
            }

        }

        @SuppressWarnings("unused")
        int numParentHeaders = readIntBigEndian(str);
        return result;
    }

    /**
     * @param string string to be parsed
     * @return parsed string
     */
    private static String parseStandardFormat(String string) {
        Pattern regex = Pattern.compile(STANDARD_FORMAT_REGEX);

        Matcher matcher = regex.matcher(string);
        if (matcher.matches()) {
            return matcher.group(1);
        }
        return null;
    }

    /**
     * @return 8 bit signed integral number
     */
    private static int readByte(DataInputStream dis) throws IOException {
        return dis.readByte();
    }

    /**
     * The data is stored as a int, then the array of bytes.
     */
    private static byte[] readBytes(DataInputStream str) throws IOException {
        int fieldLength = readIntBigEndian(str);

        byte[] result = new byte[fieldLength];
        for (int i = 0; i < fieldLength; i++) {
            if (str.available() == 0)
                throw new IOException("Reached end of file without string end");
            result[i] = str.readByte();
        }

        return result;
    }

    private static int readIntBigEndian(DataInputStream dis) throws IOException {
        byte[] buf = new byte[4];

        for (int i = 0; i < buf.length; i++) {
            buf[i] = dis.readByte();
        }

        return ByteBuffer.wrap(buf).order(ByteOrder.BIG_ENDIAN).getInt();
    }

    /**
     * @return 32 bit signed integral number
     */
    private static int readIntLittleEndian(DataInputStream dis) throws IOException {
        return dis.readInt();
    }

    private static int readShort(DataInputStream dis) throws IOException {
        return dis.readShort();
    }

    /**
     * @return A 1 byte character string. A string object is stored as an INT (to store the string length) followed by
     *         the CHAR
     *         array (to store the string contents).
     */
    private static String readString(DataInputStream str) throws IOException {
        int fieldLength = readIntBigEndian(str);
        StringBuilder buf = new StringBuilder();
        for (int i = 0; i < fieldLength; i++) {
            if (str.available() == 0)
                throw new IOException("Reached end of file without string end");
            buf.append(new String(new byte[] { str.readByte() }));
        }
        return buf.toString();
    }

    /**
     * @return A UNICODE string. A string object is stored as an INT (to store the string length) followed by the WCHAR
     *         array
     *         (to store the string contents).
     *         (http://media.affymetrix.com/support/developer/powertools/changelog/gcos-agcc/generic.html)
     */
    private static String readUnicodeString(DataInputStream str) throws IOException {
        int fieldLength = readIntBigEndian(str);

        // log.info( fieldLength );

        byte[] buf = new byte[fieldLength * 2];

        // StringBuilder buf = new StringBuilder();
        for (int i = 0; i < fieldLength * 2; i++) {
            buf[i] = str.readByte();
        }

        return new String(buf, "UTF-16");
    }

    private static int readUnsignedByte(DataInputStream dis) throws IOException {
        return dis.readUnsignedByte();
    }

}