edu.berkeley.compbio.sequtils.strings.MarkovTreeNode.java Source code

Introduction

Here is the source code for edu.berkeley.compbio.sequtils.strings.MarkovTreeNode.java
Source

/*
 * Copyright (c) 2007-2013  David Soergel  <dev@davidsoergel.com>
 * Licensed under the Apache License, Version 2.0
 * http://www.apache.org/licenses/LICENSE-2.0
 */

package edu.berkeley.compbio.sequtils.strings;

import com.davidsoergel.dsutils.AbstractGenericFactoryAware;
import com.davidsoergel.dsutils.DSArrayUtils;
import com.davidsoergel.dsutils.LabellableImpl;
import com.davidsoergel.dsutils.collections.MutableWeightedSet;
import com.davidsoergel.dsutils.collections.WeightedSet;
import com.davidsoergel.dsutils.math.MathUtils;
import com.davidsoergel.stats.DistributionException;
import com.davidsoergel.stats.Multinomial;
import com.davidsoergel.stats.MutableDistribution;
import edu.berkeley.compbio.sequtils.FilterException;
import edu.berkeley.compbio.sequtils.NotEnoughSequenceException;
import edu.berkeley.compbio.sequtils.SequenceReader;
import org.apache.commons.collections.buffer.CircularFifoBuffer;
import org.apache.commons.lang.NotImplementedException;
import org.apache.log4j.Logger;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Formatter;
import java.util.List;
import java.util.Locale;

/**
 * A node in a tree where each transition is labeled by a byte and has a local probability (that is, conditional on the
 * parent node).  If the tree is completely filled out with respect to its alphabet up to a depth k, then it represents
 * a k-order Markov model.  If it is incompletely filled out, it may represent a Variable Memory Markov model (VMM).
 *
 * @author <a href="mailto:dev@davidsoergel.com">David Soergel</a>
 * @version $Id: MarkovTreeNode.java 519 2009-10-08 07:01:28Z soergel $
 * @see RonPSANode
 *      <p/>
 *      This class should not be used to represent a Probabilistic Suffix Tree, because that is likely to be confusing.
 */
public class MarkovTreeNode extends AbstractGenericFactoryAware
        implements SequenceSpectrum<MarkovTreeNode>, MutableDistribution {
    // ------------------------------ FIELDS ------------------------------

    private LabellableImpl<String> labels = new LabellableImpl<String>();

    public void doneLabelling() {
        labels.doneLabelling();
    }

    @NotNull
    public WeightedSet<String> getImmutableWeightedLabels() {
        return labels.getImmutableWeightedLabels();
    }

    @NotNull
    public MutableWeightedSet<String> getMutableWeightedLabels() {
        return labels.getMutableWeightedLabels();
    }

    public int getItemCount() {
        return labels.getItemCount();
    }

    private static final Logger logger = Logger.getLogger(MarkovTreeNode.class);
    protected byte[] id;
    protected byte[] alphabet;

    // use the alphabet to get indexes into the child array, instead of the HashM'p, for efficiency (no autoboxing, etc)

    //   private Map<Byte, MarkovTreeNode> children = new HashMap<Byte, MarkovTreeNode>();

    // includes only the children, with nulls where the tree does not continue
    protected MarkovTreeNode[] children;

    protected Multinomial<Byte> probs = new Multinomial<Byte>();

    protected boolean leaf = true;
    private double[] logprobs;

    private String label;

    //   private final MutableWeightedSet<String> weightedLabels = new ConcurrentHashWeightedSet<String>();

    /**
     * Returns the maximum length of substrings considered in computing this statistical model of the sequence.  Our
     * implicit assumption is that the sequences being modeled have some correlation length, and thus that statistical
     * models of them can be built from substrings up to that length.  Thus, this method tells the maximum correlation
     * length provided by the model.  A manifestation of this is that conditional probabilities of symbols given a prefix
     * will cease to change as the prefix is lengthened (to the left) past this length.
     *
     * @return the maximum correlation length considered in the model.
     */
    /*   public int getMaxDepth()
       {
       // inefficient; could be cached
        
       int result = 0;
       if (probs.size() > 0)
     {
     result = 1;
     }
       for (MarkovTreeNode child : children)//.values())
     {
     if (child != null)
        {
        result = Math.max(result, child.getMaxDepth() + 1);
        }
     }
       return result;
       }*/

    // diagnostics
    private int total = 0, leaves = 0, maxdepth = 0;

    private double avgdepth = 0;

    private long originalSequenceLength;
    private long unknownCount;

    // --------------------------- CONSTRUCTORS ---------------------------

    public MarkovTreeNode() {
    }

    /**
     * Constructs a new MarkovTreeNode with the given identifier
     *
     * @param id the sequence of symbols leading to this node
     */
    public MarkovTreeNode(final byte[] id, final byte[] alphabet) {
        this.id = id;
        setAlphabet(alphabet);
    }

    public void setAlphabet(final byte[] alphabet) {
        this.alphabet = alphabet;
        logprobs = new double[alphabet.length];
        children = new MarkovTreeNode[alphabet.length];
    }

    // --------------------- GETTER / SETTER METHODS ---------------------

    /**
     * Returns the alphabet of this SequenceSpectrum object.
     *
     * @return the alphabet (type byte[]) of this SequenceSpectrum object.
     */
    public byte[] getAlphabet() {
        return alphabet;
    }

    public MarkovTreeNode[] getChildren() {
        return children;
    }

    /**
     * Returns the number of samples on which this spectrum is based.
     *
     * @return The number of samples
     */
    /*   public int getNumberOfSamples()
        {
        throw new NotImplementedException();
        }
    */
    public long getOriginalSequenceLength() {
        return originalSequenceLength;
    }

    public void setOriginalSequenceLength(final long originalSequenceLength) {
        this.originalSequenceLength = originalSequenceLength;
    }

    public long getUnknownCount() {
        return unknownCount;
    }

    public void setUnknownCount(final long unknownCount) {
        this.unknownCount = unknownCount;
    }

    /**
     * Returns a mapping of symbols to child nodes, describing the possible transitions from this node to its children.
     * Does not provide any information about the probabilities associated with those transitions.
     *
     * @return a Map<Byte, MarkovTreeNode> describing the possible transitions from this node to its children.
     */
    /*   public Map<Byte, MarkovTreeNode> getChildren()
       {
       return children;
       }*/
    public Multinomial<Byte> getProbs() {
        return probs;
    }

    public boolean isLeaf() {
        return leaf;
    }

    public void setId(final byte[] id) {
        this.id = id;
    }

    public void setLabel(final String label) {
        this.label = label;
    }

    // ------------------------ CANONICAL METHODS ------------------------

    /**
     * Performs a deep copy of this node.
     *
     * @return a copy of this node, including a deep copy of its children and their probabilities
     */
    @Override
    public MarkovTreeNode clone() {
        try {
            final MarkovTreeNode result = (MarkovTreeNode) super.clone();//new MarkovTreeNode(id, alphabet);
            result.setAlphabet(alphabet);
            result.probs = probs.clone();
            for (final byte b : alphabet)//children.keySet())
            {
                final MarkovTreeNode child = getChild(b);
                if (child != null) {
                    result.addChild(b, child.clone());
                }
            }
            return result;
        } catch (CloneNotSupportedException e) {
            logger.error("Error", e);
            throw new Error("Impossible");
        }
        /*      try
              {
              result.setProb(b, probs.get(b));
              }
           catch (DistributionException e)
              {
              // no problem, just leave it empty then
              }*/

        /*   try
           {
           result.normalize();
           }
        catch (DistributionException e)
           {
           throw new SequenceSpectrumRuntimeException(e);
           }*/
    }

    /**
     * Gets the child node associated with the given sequence, if it exists
     *
     * @param sigma the symbol to follow from this node
     * @return the node pointed to, or null if that leaf does not exist
     */
    public MarkovTreeNode getChild(final byte sigma)//throws SequenceSpectrumException
    {
        //return children == null ? null :
        return children[DSArrayUtils.indexOf(alphabet, sigma)];
    }

    private void addChild(final byte b, final MarkovTreeNode child) {
        leaf = false;
        final int childIndex = DSArrayUtils.indexOf(alphabet, b);
        children[childIndex] = child;
    }

    // ------------------------ INTERFACE METHODS ------------------------

    // --------------------- Interface AdditiveClusterable ---------------------

    public void decrementBy(final MarkovTreeNode object) {
        throw new NotImplementedException();
    }

    public void decrementByWeighted(final MarkovTreeNode object, final double weight) {
        throw new NotImplementedException();
    }

    public void incrementBy(final MarkovTreeNode object) {
        throw new NotImplementedException();
    }

    public void incrementByWeighted(final MarkovTreeNode object, final double weight) {
        throw new NotImplementedException();
    }

    public MarkovTreeNode minus(final MarkovTreeNode object) {
        throw new NotImplementedException();
    }

    public void multiplyBy(final double v) {
        throw new NotImplementedException();
    }

    public MarkovTreeNode plus(final MarkovTreeNode object) {
        throw new NotImplementedException();
    }

    public MarkovTreeNode times(final double v) {
        throw new NotImplementedException();
    }

    // --------------------- Interface Clusterable ---------------------

    /**
     * Recursively tests the deep equality of two PSTNodes.  Requires that the id, the alphabet, the tree structure, and
     * the transition probabilities are identical.
     *
     * @param other the MarkovTreeNode to compare
     * @return true if the trees have the same structure and transition probabilities, false otherwise
     */
    public boolean equalValue(final MarkovTreeNode other) {
        if (!Arrays.equals(id, other.id)) {
            return false;
        }
        if (!Arrays.equals(alphabet, other.alphabet)) {
            return false;
        }
        /*   if (children.size() != other.children.size())
           {
           return false;
           }*/
        for (final byte b : alphabet)//children.keySet())
        {
            double v = 0;
            int unknown = 0;
            try {
                v = probs.get(b);
            } catch (DistributionException e) {
                unknown++;
            }
            double v1 = 0;
            try {
                v1 = other.probs.get(b);
            } catch (DistributionException e1) {
                unknown++;
            }
            if (unknown == 1) {
                return false;
            }
            if (unknown == 0 && !MathUtils.equalWithinFPError(v, v1)) {
                return false;
            }
            //MarkovTreeNode node = null;
            //MarkovTreeNode node1 = null;
            //   try
            //      {
            final MarkovTreeNode node = getChild(b);
            final MarkovTreeNode node1 = other.getChild(b);
            //      }
            //   catch (SequenceSpectrumException e)
            //      {
            //      throw new Error("Impossible");
            //      }
            if (node != null && node1 == null) {
                return false;
            }
            if (node == null && node1 != null) {
                return false;
            }
            if (node != null && node1 != null && !node.equalValue(node1)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns a String representing the sequence of symbols leading to this node in the tree. The id is internally a
     * byte[]; we just make a String out of it for the sake of the interface.
     *
     * @return a unique identifier for this object
     */
    public String getId() {
        return new String(id);
        //throw new NotImplementedException();
    }

    // --------------------- Interface SequenceSpectrum ---------------------

    /**
     * Computes the conditional probability distribution of symbols given a prefix under the model.
     *
     * @param prefix a byte array providing the conditioning prefix
     * @return the Multinomial conditional distribution of symbols following the given prefix
     * @throws edu.berkeley.compbio.sequtils.strings.SequenceSpectrumException
     *          when anything goes wrong
     */
    public Multinomial<Byte> conditionalsFrom(final byte[] prefix) throws SequenceSpectrumException {
        final MarkovTreeNode node = get(prefix);
        if (node == null || node.probs == null) {
            throw new SequenceSpectrumException("Unknown probabilities at " + Arrays.toString(prefix));
        }
        return node.probs;
    }

    /**
     * Computes the total log probability of generating the given sequence fragment under the model.  This differs from
     * {@link #totalProbability(byte[])} in that the sequence fragment is not given explicitly but only as metadata.  Thus
     * its probability may be computed from summary statistics that are already available in the given SequenceFragment
     * rather than from the raw sequence.  Also, because these probabilities are typically very small, the result is
     * returned in log space (indeed implementations will likely compute them in log space).
     *
     * @param sequenceFragment the SequenceFragment whose probability is to be computed
     * @return the natural logarithm of the conditional probability (a double value between 0 and 1, inclusive)
     */
    public double fragmentLogProbability(final SequenceFragment sequenceFragment, final boolean perSample)
            throws SequenceSpectrumException {
        // the RonPSA implementation uses backlinks and so is vastly more efficient.
        // We can't use backlinks here because they might point to nodes outside of this subtree

        synchronized (sequenceFragment.getReaderForSynchronizing()) // because of resetting the reader
        {
            final SequenceReader in;
            try {
                in = sequenceFragment.getResetReader();
            } catch (NotEnoughSequenceException e) {
                throw new SequenceSpectrumRuntimeException(e);
            }
            final int requiredPrefixLength = getMaxDepth() - 1;
            double logprob = 0;
            final CircularFifoBuffer prefix = new CircularFifoBuffer(requiredPrefixLength);

            int samples = 0;
            while (true) {
                try {
                    final byte c = in.read();

                    try {
                        // PERF converting array prefix from circularFifoBuffer to byte[] is terribly inefficient
                        final byte[] prefixAsBytes = DSArrayUtils
                                .toPrimitive((Byte[]) prefix.toArray(new Byte[prefix.size()]));

                        // these log probabilities could be cached, e.g. logConditionalProbability(c, prefix)
                        logprob += MathUtils.approximateLog(conditionalProbability(c, prefixAsBytes));

                        samples++;

                        prefix.add(c);
                    } catch (SequenceSpectrumException e) {
                        // probably just an invalid character
                        logger.debug("Invalid character " + (char) c);
                        // ignore this character as far as the probability is concerned
                        prefix.clear();
                    }
                } catch (NotEnoughSequenceException e) {
                    break;
                } catch (IOException e) {
                    logger.error("Error", e);
                    throw new SequenceSpectrumException(e);
                } catch (FilterException e) {
                    logger.error("Error", e);
                    throw new SequenceSpectrumException(e);
                }
            }

            if (perSample) {
                // we have ln(product(p) == sum(ln(p)).
                // The geometric mean is exp(sum(ln(p))/n), so to get ln(geometric mean) we need only divide by n.
                logprob /= samples;
            }

            return logprob;
        }
    }

    /**
     * Careful-- this gets cached on the first call so will be invalid later if the tree is changed
     *
     * @return
     */
    public int getMaxDepth() {
        if (maxdepth == 0) // not computed yet;  node has a depth of at least 1
        {
            diagnostics();
        }
        return maxdepth;
    }

    /**
     * Chooses a random symbol according to the conditional probabilities of symbols following the given prefix.  Shortcut
     * equivalent to conditionalsFrom(prefix).sample().byteValue()
     *
     * @param prefix a byte array providing the conditioning prefix
     * @return the chosen symbol
     */
    public byte sample(final byte[] prefix) throws SequenceSpectrumRuntimeException {
        try {
            return conditionalsFrom(prefix).sample();
            //return probs.sample();
        } catch (SequenceSpectrumException e) {
            return sample(DSArrayUtils.suffix(prefix, 1));
            /*
              logger.error("Error", e);
              throw new SequenceSpectrumRuntimeException(e);*/
        } catch (DistributionException e) {
            logger.error("Error", e);
            throw new SequenceSpectrumRuntimeException(e);
        }
    }

    /**
     * Chooses a random string according to the conditional probabilities of symbols.
     *
     * @param length the length of the desired random string
     * @return a byte[] of the desired length sampled from this distribution
     */
    public byte[] sample(final int length) throws SequenceSpectrumException {
        throw new NotImplementedException();
        /*
            byte[] result = new byte[length];
            for (int i = 0; i < length; i++)
         {
            
         }*/
    }

    /*
     public void runBeginTrainingProcessor() throws DistributionProcessorException
        {
        // do nothing
        }
        
     public void runFinishTrainingProcessor() throws DistributionProcessorException
        {
        // do nothing
        }
    */
    public void setIgnoreEdges(final boolean ignoreEdges) {
        // not relevant here...
    }

    // note that the probabilities should exist even if there are no corresponding child nodes!

    // There is no point in having a node with no associated probabilities, except temporarily

    // during the learning process before the probabilities have been filled in.

    // avoid holding a parent link; we don't need it so far

    public void setImmutable() {
        // not relevant here
    }

    /**
     * Test whether the given sequence statistics are equivalent to this one.  Differs from equals() in that
     * implementations of this interface may contain additional state which make them not strictly equal; here we're only
     * interested in whether they're equal as far as this interface is concerned.
     * <p/>
     * Naive implementations will simply test for exact equality; more sophisticated implementations ought to use a more
     * rigorous idea of "statistically equivalent", though in that case we'll probabably need to provide more parameters,
     * such as a p-value threshold to use.  Note that the spectra know the number of samples used to generate them, so at
     * least that's covered.
     *
     * @param spectrum the SequenceSpectrum to compare
     * @return True if the spectra are equivalent, false otherwise
     */
    public boolean spectrumEquals(final SequenceSpectrum spectrum) {
        throw new NotImplementedException();
    }

    // -------------------------- OTHER METHODS --------------------------

    /**
     * Gets the child node associated with the given sequence, creating it (and nodes along the way) as needed
     *
     * @param prefix the sequence of symbols to follow from this node
     * @return the node at the end of the chain of transitions
     */
    public MarkovTreeNode add(final byte[] prefix) throws SequenceSpectrumException {
        if (prefix.length == 0) {
            // this should probably never occur
            return this;
        } else if (prefix.length == 1) {
            return addChild(prefix[0]);
        } else if (prefix.length >= 1) {
            return addChild(prefix[0]).add(DSArrayUtils.suffix(prefix, 1));
        }
        throw new Error("Impossible");
    }

    /**
     * Gets the child node associated with the given symbol, creating it first if needed.
     *
     * @param sigma the transition to follow from this node
     * @return the node at the other end of the transition
     */
    public MarkovTreeNode addChild(final byte sigma) {
        leaf = false;
        final int index = DSArrayUtils.indexOf(alphabet, sigma);
        MarkovTreeNode result = children[index];
        if (result == null) {
            result = new MarkovTreeNode(DSArrayUtils.append(id, sigma), alphabet);
            children[index] = result;
        }
        return result;
    }

    public void addPseudocounts() {
        throw new NotImplementedException();
    }

    public double conditionalProbability(final byte sigma) throws SequenceSpectrumException {
        try {
            return probs.get(sigma);
        } catch (DistributionException e) {
            logger.error("Error", e);
            throw new SequenceSpectrumException(e);
        }
    }

    public double conditionalProbability(final byte[] suffix, final byte[] prefix)
            throws SequenceSpectrumException {
        if (suffix.length == 0) {
            return 1;
        }
        final byte sigma = suffix[0];
        if (suffix.length == 1) {
            return conditionalProbability(sigma, prefix);
        }
        return conditionalProbability(sigma, prefix)
                * conditionalProbability(DSArrayUtils.suffix(suffix, 1), DSArrayUtils.append(prefix, sigma));
    }

    public double conditionalProbabilityByAlphabetIndex(final int c) {
        try {
            return probs.get(alphabet[c]);
        } catch (DistributionException e) {
            logger.error("Error", e);
            throw new Error(e);
        }
    }

    /**
     * Recursively fills out the tree so that each node has a complete set of children, or none at all.  That is: if this
     * node has any children at all, ensures that there is a child for each symbol in the alphabet.
     */
    /*   public void complete()
     {
     if (children == null || children.size() == 0)
        {
        return;
        }
        
     // we only get here if there is at least one child.  In that case, the get() calls will create all the other children, as needed.
     for (byte sigma : getAlphabet())
        {
        add(sigma).complete();
        }
     }*/

    /**
     * Recursively fills out the tree so that each node has either a complete set of children or none at all, and assigns
     * conditional probabilities according to the given spectrum.  That is: if this node has any children, ensures that
     * there is a child for each symbol in the alphabet.
     */
    public void copyProbsFromSpectrumRecursively(final SequenceSpectrum spectrum)
    //throws SequenceSpectrumException//DistributionException,
    {
        for (final byte sigma : getAlphabet()) {
            // copy the probabilities regardless
            double prob = 0;
            try {
                prob = spectrum.conditionalProbability(sigma, id);
            } catch (SequenceSpectrumException e) {
                //logger.error("Error", e);
                // no worries, just let prob=0 then
            }
            try {
                setProb(sigma, prob);
            } catch (DistributionException e) {
                throw new SequenceSpectrumRuntimeException(e);
            }

            // Nooo!  We already made sure that we have probability entries for each possible child.
            // But we only want a full-fledged node if the child itself has children.
            /*
                
                    // if there are any children, make sure there are all children
                    if (children != null && children.size() != 0)
                       {
                       add(sigma).copyProbsFrom(spectrum);
                       }
                
                       */
        }
        try {
            probs.normalize();
        } catch (DistributionException e) {
            throw new SequenceSpectrumRuntimeException(e);
        }

        // okay, but we still need to recurse to the children that do exist
        for (final MarkovTreeNode child : children)//.values())
        {
            if (child != null) {
                child.copyProbsFromSpectrumRecursively(spectrum);
            }
        }
    }

    private void setProb(final byte b, final double prob) throws DistributionException {
        probs.put(b, prob);
    }

    public int countChildren() {
        int result = 0;
        for (final MarkovTreeNode n : children) {
            if (n != null) {
                result++;
            }
        }
        return result;
    }

    protected void diagnostics() {
        for (final MarkovTreeNode node : getAllDownstreamNodes()) {
            total++;
            if (node.isLeaf()) {
                leaves++;
                final int depth = node.getIdBytes().length;//length();
                avgdepth += depth;
                maxdepth = Math.max(maxdepth, depth);
            }
        }
        maxdepth -= getIdBytes().length; // the symbols upstream of this node don't count
        maxdepth += 1;
        avgdepth /= leaves;
        avgdepth += 1;
        //   logger.info("Learned Ron PST using params " + branchAbsoluteMin + " " + branchConditionalMin + " " + pRatioMinMax
        //         + " " + l_max);
        logger.debug("Learned Ron PSA with " + total + " nodes, " + leaves + " leaves, avg depth " + avgdepth
                + ", max depth " + maxdepth);
        if (logger.isTraceEnabled()) {
            logger.trace("\n" + toLongString());
        }
    }

    /**
     * Gets a List<RonPSTNode> of all nodes in the subtree starting from this node, including this node.
     *
     * @return a List of all nodes in the subtree starting from this node, including this node.
     */
    public List<MarkovTreeNode> getAllDownstreamNodes() {
        final List<MarkovTreeNode> result = new ArrayList<MarkovTreeNode>();

        collectDownstreamNodes(result);
        return result;
    }

    /**
     * Recursively append this node and all its children (upstream nodes) to the provided List<RonPSTNode>
     *
     * @param nodeList a List to which to add this node and its children
     */
    private void collectDownstreamNodes(final List<MarkovTreeNode> nodeList) {
        nodeList.add(this);
        for (final MarkovTreeNode n : children) {
            if (n != null) {
                n.collectDownstreamNodes(nodeList);
            }
        }
    }

    public byte[] getIdBytes() {
        return id;
    }

    public String toLongString() {
        final StringBuffer sb = new StringBuffer();
        final Formatter formatter = new Formatter(sb, Locale.US);
        appendString(formatter, "");
        return sb.toString();
    }

    public void appendString(final Formatter formatter, final String indent) {
        for (int i = 0; i < alphabet.length; i++) {
            final byte b = alphabet[i];
            try {
                formatter.format("%s %3.3g -> %c\n", indent, probs.get(b), b);
                //append(indent + probs.get(b) + " -> " + (char)b.byteValue() + "\n");
                final MarkovTreeNode child = children[i];
                if (child != null && child.getId().length() > getId().length()) {
                    child.appendString(formatter, indent + "     | ");
                }
            } catch (DistributionException e) {
                //sb.append(indent + "ERROR ->" + b);
                formatter.format("%s %s -> %c\n", indent, "ERROR", b);
            }
        }
    }

    /**
     * Computes the total log probability of generating the given sequence fragment under the model.  This differs from
     * {@link #totalProbability(byte[])} in that the sequence fragment is not given explicitly but only as metadata.  Thus
     * its probability may be computed from summary statistics that are already available in the given SequenceFragment
     * rather than from the raw sequence.  Also, because these probabilities are typically very small, the result is
     * returned in log space (indeed implementations will likely compute them in log space).
     *
     * @param sequenceFragment the SequenceFragment whose probability is to be computed
     * @return the natural logarithm of the conditional probability (a double value between 0 and 1, inclusive)
     */
    public double fragmentLogProbabilityFromIntKcount(final SequenceFragment sequenceFragment)
            throws SequenceSpectrumException {
        final IntKcount kc = (IntKcount) sequenceFragment.getSpectrum(IntKcount.class, null);
        //int k = kc.getK();

        double result = 0;

        for (final byte[] firstWord : sequenceFragment.getFirstWords(kc.getK())) {
            result += MathUtils.approximateLog(totalProbability(firstWord));
        }

        /*   try
           {
           byte[] prefix = sequenceFragment.getPrefix(kc.getK());
           result += MathUtils.approximateLog(totalProbability(prefix));
           }
        catch (NotEnoughSequenceException e)
           {
           logger.error("Error", e);
           throw new SequenceSpectrumException(e);
           }*/

        final int[] counts = kc.getArray();

        for (int id = 0; id < counts.length; id++) {
            if (counts[id] > 0) {
                final byte[] prefix = kc.prefixForId(id);
                final byte sigma = kc.lastSymbolForId(id);

                result += counts[id] * MathUtils.approximateLog(conditionalProbability(sigma, prefix));
            }
        }

        return result;
    }

    /**
     * Computes the probability of generating the given sequence under the model.
     *
     * @param s a byte array
     * @return the probability, a double value between 0 and 1, inclusive
     */
    public double totalProbability(final byte[] s) throws SequenceSpectrumException {
        if (s.length == 0) {
            return 1;
        }
        return conditionalProbability(s, DSArrayUtils.EMPTY_BYTE_ARRAY);
    }

    public double conditionalProbability(final byte sigma, final byte[] prefix) throws SequenceSpectrumException {
        final MarkovTreeNode node = get(prefix);
        if (node == null || node.probs == null) {
            throw new SequenceSpectrumException("Unknown probability: " + Arrays.toString(prefix) + " -> " + sigma);
        }
        try {
            return node.probs.get(sigma);
        } catch (DistributionException e) {
            logger.error("Error", e);
            throw new SequenceSpectrumException(e);
        }
    }

    /**
     * Gets the child node associated with the given sequence, if it exists
     *
     * @param seq the sequence of symbols to follow from this node
     * @return the node at the end of the chain of transitions, or null if that leaf does not exist
     */
    public MarkovTreeNode get(final byte[] seq) throws SequenceSpectrumException {
        if (seq.length == 0) {
            // this should probably never occur
            return this;
        }

        final MarkovTreeNode nextChild = children[DSArrayUtils.indexOf(alphabet, seq[0])];

        if (nextChild == null) {
            return null;
        }
        if (seq.length == 1) {
            return nextChild;
        } else if (seq.length >= 1) {
            return nextChild.get(DSArrayUtils.suffix(seq, 1));
        }
        throw new Error("Impossible");
    }

    public double getAvgDepth() {
        return avgdepth;
    }

    /**
     * gets the node associated with the the given sequence, or null if that node does not exist
     *
     * @param descendantId the sequence to walk
     * @return the MarkovTreeNode
     */
    public MarkovTreeNode getDescendant(final byte[] descendantId)//throws SequenceSpectrumException
    {
        MarkovTreeNode result = this;

        // this could also have been recursive, but that would have involved making each suffix byte[] explicitly
        for (final byte b : descendantId) {
            MarkovTreeNode next = result.getChild(b);
            if (next == null) {
                return null;
            } else {
                result = next;
            }
        }

        return result;
    }

    public String getExclusiveLabel() {
        return label;
    }

    public List<byte[]> getFirstWords(final int k) {
        throw new NotImplementedException();
    }

    /**
     * gets the node associated with the longest available prefix of the given sequence.
     *
     * @param prefix the sequence to walk
     * @return the MarkovTreeNode
     */
    public MarkovTreeNode getLongestPrefix(final byte[] prefix) {
        MarkovTreeNode result = this;

        // this could also have been recursive, but that would have involved making each suffix byte[] explicitly
        for (final byte b : prefix) {
            MarkovTreeNode next = result.getChild(b);
            if (next == null) {
                return result;
            } else {
                result = next;
            }
        }

        return result;
    }

    public int getNumLeaves() {
        return leaves;
    }

    public int getNumNodes() {
        return total;
    }

    public String getSourceId() {
        throw new NotImplementedException();
    }

    public int getSubtreeNodes() {
        int result = 1;// this
        for (final MarkovTreeNode child : children)//.values())
        {
            if (child != null) {
                result += child.getSubtreeNodes();
            }
        }
        return result;
    }

    /**
     * Returns the conditional probability of the given symbol from this node.  {@link #updateLogProbs()} must have been
     * run first.
     *
     * @param sigma
     * @return
     * @throws SequenceSpectrumException
     */
    public double logConditionalProbability(final byte sigma) throws SequenceSpectrumException {
        try {
            return logprobs[DSArrayUtils.indexOf(alphabet, sigma)];//alphabetIndexForSymbol(sigma)];
        } catch (IndexOutOfBoundsException e) {
            throw new SequenceSpectrumException("Symbol " + (char) sigma + " not in alphabet");
        }
    }

    public double logConditionalProbabilityByAlphabetIndex(final int c) {
        return logprobs[c];
    }

    /*
    public Set<MarkovTreeNode> getAllNodes()
       {
       Set<MarkovTreeNode> result = new HashSet<MarkovTreeNode>();
       result.add(this);
       for (MarkovTreeNode child : children.values())
     {
     child.addAllNodes(result);
     }
       return result;
       }
        
    public void addAllNodes(Set<MarkovTreeNode> result)
       {
       result.add(this);
       for (MarkovTreeNode child : children.values())
     {
     child.addAllNodes(result);
     }
       }*/

    private void normalize() throws DistributionException {
        probs.normalize();
    }

    public void updateLogProbsRecursive() {
        updateLogProbs();
        for (final MarkovTreeNode child : children)//.values())
        {
            if (child != null) {
                child.updateLogProbsRecursive();
            }
        }
    }

    public void updateLogProbs() {
        try {
            for (int i = 0; i < alphabet.length; i++) {
                logprobs[i] = MathUtils.approximateLog(probs.get(alphabet[i]));
            }
        } catch (DistributionException e) {
            logger.error("Error", e);
            throw new SequenceSpectrumRuntimeException(e);
        }
    }
}