Java tutorial
/* dsh-bio-variant Variants. Copyright (c) 2013-2019 held jointly by the individual authors. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 3 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA. > http://www.fsf.org/licensing/licenses/lgpl.html > http://www.opensource.org/licenses/lgpl-license.php */ package org.dishevelled.bio.variant.vcf; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.collect.Maps.newHashMap; import java.io.IOException; import java.util.Collection; import java.util.Map; import com.google.common.io.CharStreams; import com.google.common.io.LineProcessor; /** * Low-level VCF parser. * * @author Michael Heuer */ public final class VcfParser { /** * Private no-arg constructor. */ private VcfParser() { // empty } /** * Parse the specified readable. * * @param readable readable, must not be null * @param listener low-level event based parser callback, must not be null * @throws IOException if an I/O error occurs */ public static void parse(final Readable readable, final VcfParseListener listener) throws IOException { checkNotNull(readable); VcfLineProcessor lineProcessor = new VcfLineProcessor(listener); CharStreams.readLines(readable, lineProcessor); } /** * VCF line processor. */ private static final class VcfLineProcessor implements LineProcessor<Object> { /** Line number. */ private long lineNumber = 0; /** VCF parse listener. */ private final VcfParseListener listener; /** Map of sample names keyed by column. */ private final Map<Integer, String> samples = newHashMap(); /** * Create a new VCF line processor. * * @param listener VCF parse listener */ private VcfLineProcessor(final VcfParseListener listener) { checkNotNull(listener); this.listener = listener; } @Override public Object getResult() { return null; } @Override public boolean processLine(final String line) throws IOException { lineNumber++; // consider using guava Splitter String[] tokens = line.split("\t"); // skip blank lines if (tokens.length == 0) { return true; } if (tokens[0].startsWith("##")) { // meta-information lines listener.meta(line); } else if (tokens[0].startsWith("#CHROM")) { // header line if (tokens.length > 8) { for (int column = 9, columns = tokens.length; column < columns; column++) { samples.put(column, tokens[column]); } } Collection<String> values = samples.values(); listener.samples(values.toArray(new String[values.size()])); } else { // data lines listener.lineNumber(lineNumber); if (tokens.length < 8) { throw new IOException("invalid data line at line number " + lineNumber + ", expected 8 tokens, found " + tokens.length); } listener.chrom(tokens[0]); try { listener.pos(Long.parseLong(tokens[1])); } catch (NumberFormatException e) { throw new IOException("invalid pos at line number " + lineNumber, e); } String[] idTokens = tokens[2].split(";"); listener.id(isMissingValue(idTokens) ? new String[0] : idTokens); listener.ref(tokens[3]); String[] altTokens = tokens[4].split(","); listener.alt(isMissingValue(altTokens) ? new String[0] : altTokens); try { listener.qual(isMissingValue(tokens[5]) ? null : Double.parseDouble(tokens[5])); } catch (NumberFormatException e) { throw new IOException("invalid qual at line number " + lineNumber, e); } String[] filterTokens = tokens[6].split(";"); listener.filter(isMissingValue(filterTokens) ? new String[0] : filterTokens); String[] infoTokens = tokens[7].split(";"); if (!isMissingValue(infoTokens)) { for (String infoToken : infoTokens) { String[] entryTokens = infoToken.split("="); if (entryTokens.length == 1) { listener.info(entryTokens[0], "true"); } else if (entryTokens.length == 2) { String infoId = entryTokens[0]; String value = entryTokens[1]; if (isMissingValue(value)) { listener.info(infoId); } else { listener.info(infoId, value.split(",")); } } } } if (tokens.length > 8) { String[] formatTokens = tokens[8].split(":"); listener.format(formatTokens); for (int column = 9, columns = tokens.length; column < columns; column++) { String[] genotypeTokens = tokens[column].split(":"); if (genotypeTokens.length > formatTokens.length) { throw new IOException("invalid genotype fields at line number " + lineNumber + ", too many genotype fields"); } if (formatTokens.length > 0 && "GT".equals(formatTokens[0])) { if (genotypeTokens.length == 0) { throw new IOException("invalid genotype fields at line number " + lineNumber + ", missing genotype (GT) field"); } String gt = genotypeTokens[0]; listener.genotype(samples.get(column), "GT", gt); } for (int i = 1, size = Math.min(formatTokens.length, genotypeTokens.length); i < size; i++) { if (!isMissingValue(genotypeTokens[i])) { listener.genotype(samples.get(column), formatTokens[i], genotypeTokens[i].split(",")); } } } } if (!listener.complete()) { return false; } } return true; } } /** * Return true if the specified value is the missing value (<code>"."</code>). * * @param value value * @return true if the specified value is the missing value (<code>"."</code>) */ static boolean isMissingValue(final String value) { return ".".equals(value); } /** * Return true if the specified array of values is the missing value (<code>"."</code>). * * @param values array of values * @return true if the specified array of values is the missing value (<code>"."</code>) */ static boolean isMissingValue(final String[] values) { return (values.length == 1 && ".".equals(values[0])); } }