Java tutorial
/*
 * File TreeParser.java
 *
 * Copyright (C) 2010 Remco Bouckaert remco@cs.auckland.ac.nz
 *
 * This file is part of BEAST2.
 * See the NOTICE file distributed with this work for additional
 * information regarding copyright ownership and licensing.
 *
 * BEAST is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * BEAST is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with BEAST; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin St, Fifth Floor,
 * Boston, MA 02110-1301 USA
 */
package beast.util;

import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;
import java.util.Map;

import beast.util.treeparser.NewickParser;
import beast.util.treeparser.NewickParser.MetaContext;
import beast.util.treeparser.NewickParserBaseVisitor;
import org.antlr.v4.runtime.*;
import org.antlr.v4.runtime.misc.NotNull;
import org.antlr.v4.runtime.tree.ParseTree;

import beast.core.Description;
import beast.core.Input;
import beast.core.StateNode;
import beast.core.StateNodeInitialiser;
import beast.core.util.Log;
import beast.evolution.alignment.Alignment;
import beast.evolution.alignment.Taxon;
import beast.evolution.alignment.TaxonSet;
import beast.evolution.tree.Node;
import beast.evolution.tree.Tree;
import beast.evolution.tree.TreeUtils;
import beast.util.treeparser.NewickLexer;

/**
 * A {@link Tree} that is built by parsing a Newick string (including any
 * metadata embedded in that string).
 *
 * <p>Leaf labels found in the Newick string are mapped to node numbers either
 * through a list of taxon names ({@link #labels}) or, when the Newick is not
 * flagged as labelled and a label is an integer, by subtracting the configured
 * offset from that integer.
 */
@Description("Create beast.tree by parsing from a specification of a beast.tree in Newick format " +
        "(includes parsing of any meta data in the Newick string).")
public class TreeParser extends Tree implements StateNodeInitialiser {

    /**
     * default beast.tree branch length, used when that info is not in the Newick beast.tree
     * (written as a double literal so the value is not first rounded to float precision)
     */
    final static double DEFAULT_LENGTH = 0.001;

    /**
     * labels of leafs, order of this list corresponds to node numbers
     */
    List<String> labels = null;

    /**
     * for memory saving, set to true
     */
    final boolean suppressMetadata = false;

    /**
     * This solves issues where the taxa labels are numbers (in generated
     * beast.tree data for example).
     */
    public final Input<Boolean> isLabelledNewickInput = new Input<>("IsLabelledNewick",
            "Is the newick tree labelled (alternatively contains node numbers)? Default=false.", false);

    public final Input<Alignment> dataInput = new Input<>("taxa",
            "Specifies the list of taxa represented by leaves in the beast.tree");

    // not required, Beauti may need this for example
    public final Input<String> newickInput = new Input<>("newick",
            "initial beast.tree represented in newick format");

    public final Input<Integer> offsetInput = new Input<>("offset",
            "offset if numbers are used for taxa (offset=the lowest taxa number) default=1", 1);

    public final Input<Double> thresholdInput = new Input<>("threshold",
            "threshold under which node heights (derived from lengths) are set to zero. Default=0.", 0.0);

    public final Input<Boolean> allowSingleChildInput = new Input<>("singlechild",
            "flag to indicate that single child nodes are allowed. Default=true.", true);

    public final Input<Boolean> adjustTipHeightsInput = new Input<>("adjustTipHeights",
            "flag to indicate if tipHeights shall be adjusted when date traits missing. Default=true.", true);

    public final Input<Double> scaleInput = new Input<>("scale",
            "scale used to multiply internal node heights during parsing. Useful for importing starting from external" +
                    " programs, for instance, RaxML tree rooted using Path-o-gen.", 1.0);

    public final Input<Boolean> binarizeMultifurcationsInput = new Input<>("binarizeMultifurcations",
            "Whether or not to turn multifurcations into sequences of bifurcations. (Default true.)", true);

    /**
     * When true, a leaf label that is not found in {@link #labels} is appended
     * to that list instead of causing a parsing error.
     */
    boolean createUnrecognizedTaxa = false;

    /**
     * Flag to indicate whether integer leaf labels have been used. If
     * they have, these will be used to interpret Nexus label translation maps
     * in place of the node numbers.
     */
    boolean integerLeafLabels = true;

    /**
     * Ensure the class behaves properly, even when inputs are not specified.
     *
     * <p>Determines the list of leaf labels, parses the Newick input (or installs
     * a single dummy root node when no Newick string is given), optionally
     * renumbers the leaves in alphabetical label order, processes traits and
     * finally either applies the time trait or resets all tip heights to zero.
     */
    @Override
    public void initAndValidate() {
        boolean sortNodesAlphabetically = false;

        // Work out which list of taxon labels the leaf labels are resolved against.
        if (dataInput.get() != null) {
            labels = dataInput.get().getTaxaNames();
        } else if (m_taxonset.get() != null) {
            if (labels == null) {
                labels = m_taxonset.get().asStringList();
            } else { // else labels were set by TreeParser c'tor
                sortNodesAlphabetically = true;
            }
        } else if (isLabelledNewickInput.get()) {
            if (m_initial.get() != null) {
                labels = m_initial.get().getTaxonset().asStringList();
            } else {
                // No taxa known up front: collect them while parsing.
                labels = new ArrayList<>();
                createUnrecognizedTaxa = true;
                sortNodesAlphabetically = true;
            }
        } else if (m_initial.get() != null) {
            // try to pick up taxa from initial tree
            final Tree tree = m_initial.get();
            if (tree.m_taxonset.get() != null) {
                labels = tree.m_taxonset.get().asStringList();
            }
        }

        final String newick = newickInput.get();
        if (newick == null || newick.isEmpty()) {
            // can happen while initialising Beauti
            final Node dummy = new Node();
            setRoot(dummy);
        } else {
            setRoot(parseNewick(newick));
        }

        super.initAndValidate();

        m_sTaxaNames = null;

        if (sortNodesAlphabetically) {
            // correct for node ordering: ensure order is alphabetical
            for (int i = 0; i < getNodeCount() && i < labels.size(); i++) {
                m_nodes[i].setID(labels.get(i));
            }

            Node[] nodes = new Node[labels.size()];
            System.arraycopy(m_nodes, 0, nodes, 0, labels.size());

            Arrays.sort(nodes, (o1, o2) -> o1.getID().compareTo(o2.getID()));
            for (int i = 0; i < labels.size(); i++) {
                m_nodes[i] = nodes[i];
                nodes[i].setNr(i);
            }
        }

        if (m_initial.get() != null) {
            processTraits(m_initial.get().m_traitList.get());
        } else {
            processTraits(m_traitList.get());
        }

        if (timeTraitSet != null) {
            adjustTreeNodeHeights(root);
        } else if (adjustTipHeightsInput.get()) {

            double treeLength = TreeUtils.getTreeLength(this, getRoot());

            double extraTreeLength = 0.0;
            double maxTipHeight = 0.0;

            // all nodes should be at zero height if no date-trait is available
            for (int i = 0; i < getLeafNodeCount(); i++) {
                double height = getNode(i).getHeight();
                if (maxTipHeight < height) {
                    maxTipHeight = height;
                }
                extraTreeLength += height;
                getNode(i).setHeight(0);
            }

            double scaleFactor = (treeLength + extraTreeLength) / treeLength;

            final double SCALE_FACTOR_THRESHOLD = 0.001;

            // if the change in total tree length is more than 0.1% then give the user a warning!
            if (scaleFactor > 1.0 + SCALE_FACTOR_THRESHOLD) {

                DecimalFormat format = new DecimalFormat("#.##");

                Log.info.println(
                        "WARNING: Adjust tip heights attribute set to 'true' in " + getClass().getSimpleName());
                Log.info.println(" has resulted in significant (>"
                        + format.format(SCALE_FACTOR_THRESHOLD * 100.0) + "%) change in tree length.");
                Log.info.println(
                        " Use " + adjustTipHeightsInput.getName() + "='false' to override this default.");
                Log.info.printf(" original max tip age = %8.3f\n", maxTipHeight);
                Log.info.printf(" new max tip age = %8.3f\n", 0.0);
                Log.info.printf(" original tree length = %8.3f\n", treeLength);
                Log.info.printf(" new tree length = %8.3f\n", treeLength + extraTreeLength);
                Log.info.printf(" TL scale factor = %8.3f\n", scaleFactor);
            }
        }

        // Publish the labels gathered during parsing as the taxon set of this tree.
        if (m_taxonset.get() == null && labels != null && isLabelledNewickInput.get()) {
            m_taxonset.setValue(new TaxonSet(Taxon.createTaxonList(labels)), this);
        }

        initStateNodes();
    } // init

    /**
     * Creates an unconfigured parser; inputs are expected to be set before
     * {@link #initAndValidate()} is called.
     */
    public TreeParser() {
    }

    /**
     * Create a tree from the given newick string, taking taxon names from an alignment.
     *
     * @param alignment alignment supplying the taxon names
     * @param newick    the newick of the tree
     */
    public TreeParser(final Alignment alignment, final String newick) {
        dataInput.setValue(alignment, this);
        newickInput.setValue(newick, this);
        initAndValidate();
    }

    /**
     * Create a tree from the given newick format
     *
     * @param taxaNames a list of taxa names to use, or null.
     *                  If null then IsLabelledNewick will be set to true
     * @param newick    the newick of the tree
     * @param offset    the offset to map node numbers in newick format to indices in taxaNames.
     *                  so, name(node with nodeNumber) = taxaNames[nodeNumber-offset]
     * @param adjustTipHeightsWhenMissingDateTraits
     *                  true if tip heights should be adjusted to zero
     */
    public TreeParser(final List<String> taxaNames,
                      final String newick,
                      final int offset,
                      final boolean adjustTipHeightsWhenMissingDateTraits) {
        if (taxaNames == null) {
            isLabelledNewickInput.setValue(true, this);
        } else {
            m_taxonset.setValue(new TaxonSet(Taxon.createTaxonList(taxaNames)), this);
        }
        newickInput.setValue(newick, this);
        offsetInput.setValue(offset, this);
        adjustTipHeightsInput.setValue(adjustTipHeightsWhenMissingDateTraits, this);
        labels = taxaNames;
        initAndValidate();
    }

    /**
     * Parses newick format. The default does not adjust heights and allows single child nodes.
     * Modifications of the input should be deliberately made by calling e.g. new TreeParser(newick, true, false).
     *
     * @param newick a string representing a tree in newick format
     */
    public TreeParser(final String newick) {
        this(newick, false, true, true, 1);
    }

    /**
     * @param newick                a string representing a tree in newick format
     * @param adjustTipHeights      true if the tip heights should be adjusted to 0 (i.e. contemporaneous) after reading in tree.
     * @param allowSingleChildNodes true if internal nodes with single children are allowed
     */
    public TreeParser(final String newick,
                      final boolean adjustTipHeights,
                      final boolean allowSingleChildNodes) {
        this(newick, adjustTipHeights, allowSingleChildNodes, true, 1);
    }

    /**
     * @param newick           a string representing a tree in newick format
     * @param adjustTipHeights true if the tip heights should be adjusted to 0 (i.e. contemporaneous) after reading in tree.
     */
    public TreeParser(final String newick, final boolean adjustTipHeights) {
        this(newick, adjustTipHeights, true, true, 1);
    }

    /**
     * @param newick                a string representing a tree in newick format
     * @param adjustTipHeights      true if the tip heights should be adjusted to 0 (i.e. contemporaneous) after reading in tree.
     * @param allowSingleChildNodes true if internal nodes with single children are allowed
     * @param isLabeled             true if nodes are labeled with taxa labels
     * @param offset                if isLabeled == false and node labeling starts with x
     *                              then offset should be x. When isLabeled == true offset should
     *                              be 1 as by default.
     */
    public TreeParser(final String newick,
                      final boolean adjustTipHeights,
                      final boolean allowSingleChildNodes,
                      final boolean isLabeled,
                      final int offset) {
        this(newick, adjustTipHeights, allowSingleChildNodes, isLabeled, offset, true);
    }

    /**
     * @param newick                  a string representing a tree in newick format
     * @param adjustTipHeights        true if the tip heights should be adjusted to 0 (i.e. contemporaneous) after reading in tree.
     * @param allowSingleChildNodes   true if internal nodes with single children are allowed
     * @param isLabeled               true if nodes are labeled with taxa labels
     * @param offset                  if isLabeled == false and node labeling starts with x
     *                                then offset should be x. When isLabeled == true offset should
     *                                be 1 as by default.
     * @param binarizeMultifurcations set to true to convert multifurcations to bifurcations.
     */
    public TreeParser(final String newick,
                      final boolean adjustTipHeights,
                      final boolean allowSingleChildNodes,
                      final boolean isLabeled,
                      final int offset,
                      final boolean binarizeMultifurcations) {
        newickInput.setValue(newick, this);
        isLabelledNewickInput.setValue(isLabeled, this);
        adjustTipHeightsInput.setValue(adjustTipHeights, this);
        allowSingleChildInput.setValue(allowSingleChildNodes, this);
        offsetInput.setValue(offset, this);
        binarizeMultifurcationsInput.setValue(binarizeMultifurcations, this);
        initAndValidate();
    }

    /**
     * Parse a newick-ish string and generate the BEAST tree it describes.
     *
     * <p>Lexer and parser syntax errors are reported by throwing a
     * {@link TreeParsingException} carrying the position of the error.
     *
     * @param newick string to parse
     * @return root node of tree
     */
    public Node parseNewick(String newick) {
        CharStream charStream = CharStreams.fromString(newick);

        // Custom parse/lexer error listener
        BaseErrorListener errorListener = new BaseErrorListener() {
            @Override
            public void syntaxError(Recognizer<?, ?> recognizer,
                                    Object offendingSymbol,
                                    int line, int charPositionInLine,
                                    String msg, RecognitionException e) {
                throw new TreeParsingException(msg, charPositionInLine, line);
            }
        };

        // Use lexer to produce token stream
        NewickLexer lexer = new NewickLexer(charStream);
        lexer.removeErrorListeners();
        lexer.addErrorListener(errorListener);

        CommonTokenStream tokens = new CommonTokenStream(lexer);

        // Parse token stream to produce parse tree
        NewickParser parser = new NewickParser(tokens);
        parser.removeErrorListeners();
        parser.addErrorListener(errorListener);

        ParseTree parseTree = parser.tree();

        // Traverse parse tree, constructing BEAST tree along the way
        NewickASTVisitor visitor = new NewickASTVisitor();

        return visitor.visit(parseTree);
    }

    /**
     * Given a map of name translations (string to string),
     * rewrites all leaf ids that match a key in the map
     * to the respective value in the matching key/value pair.
     * If the current leaf id is null, or if a non-integer label was seen while
     * parsing, the translation keys are interpreted as node numbers (origin 1):
     * the leaf with node number n is looked up under the key n+1.
     * Leaves whose key is not present in the map are left unchanged.
     *
     * @param translationMap map of name translations
     */
    public void translateLeafIds(final Map<String, String> translationMap) {
        for (final Node leaf : getExternalNodes()) {
            String id = leaf.getID();

            if (id == null || !integerLeafLabels) {
                id = Integer.toString(leaf.getNr() + 1);
            }

            final String newId = translationMap.get(id);
            if (newId != null) {
                leaf.setID(newId);
            }
        }
    }

    /**
     * Visits each component of the AST built from the Newick string, constructing
     * a BEAST tree along the way.
     */
    class NewickASTVisitor extends NewickParserBaseVisitor<Node> {

        /** Running count used to hand out numbers to nodes the Newick string did not number. */
        private int numberedNodeCount = 0;

        /**
         * Builds the whole tree: visits the root node, sorts it, converts edge
         * lengths to heights, numbers the remaining nodes and rejects duplicate taxa.
         *
         * @param ctx parse-tree context of the tree rule
         * @return root of the constructed tree
         */
        @Override
        public Node visitTree(@NotNull NewickParser.TreeContext ctx) {
            Node root = visit(ctx.node());

            // Ensure tree is properly sorted in terms of node numbers.
            root.sort();

            // Replace lengths read from Newick with heights.
            convertLengthToHeight(root);

            // Make sure internal nodes are numbered correctly
            numberUnnumberedNodes(root);

            // Check for duplicate taxa
            BitSet nodeNrSeen = new BitSet();
            for (Node leaf : root.getAllLeafNodes()) {
                final int nr = leaf.getNr();
                if (nr < 0) {
                    continue; // Skip unnumbered leaves
                }

                if (nodeNrSeen.get(nr)) {
                    // labels may be absent or too short when leaves were numbered
                    // directly from integer labels; fall back to the leaf's own id
                    // so the duplicate is still reported as a parsing error.
                    final String taxon = (labels != null && nr < labels.size())
                            ? labels.get(nr)
                            : leaf.getID();
                    throw new TreeParsingException("Duplicate taxon found: " + taxon);
                }
                nodeNrSeen.set(nr);
            }

            return root;
        }

        /**
         * Stores the raw metadata text on the node and, unless metadata is
         * suppressed, each attribute as a key/value pair. Values are stored as
         * {@code Double}, {@code String} (with surrounding quotes removed) or
         * {@code Double[]}.
         *
         * @param node         node to annotate
         * @param metaContext  parse-tree context holding the attributes
         * @param isLengthMeta true to store as length (branch) metadata, false for node metadata
         */
        private void processMetadata(Node node, MetaContext metaContext, boolean isLengthMeta) {
            // Comma-separated concatenation of the raw attribute texts.
            final StringBuilder rawMetaData = new StringBuilder();
            for (int i = 0; i < metaContext.attrib().size(); i++) {
                if (i > 0) {
                    rawMetaData.append(',');
                }
                rawMetaData.append(metaContext.attrib().get(i).getText());
            }
            final String metaDataString = rawMetaData.toString();

            if (isLengthMeta) {
                node.lengthMetaDataString = metaDataString;
            } else {
                node.metaDataString = metaDataString;
            }

            if (!suppressMetadata) {
                for (NewickParser.AttribContext attribctx : metaContext.attrib()) {
                    final String key = attribctx.attribKey.getText();
                    final Object value;

                    if (attribctx.attribValue().attribNumber() != null) {
                        value = Double.parseDouble(attribctx.attribValue().attribNumber().getText());
                    } else if (attribctx.attribValue().ASTRING() != null) {
                        String stringValue = attribctx.attribValue().ASTRING().getText();
                        if (stringValue.startsWith("\"") || stringValue.startsWith("\'")) {
                            stringValue = stringValue.substring(1, stringValue.length() - 1);
                        }
                        value = stringValue;
                    } else if (attribctx.attribValue().vector() != null) {
                        try {
                            List<NewickParser.AttribValueContext> elementContexts =
                                    attribctx.attribValue().vector().attribValue();

                            Double[] arrayValues = new Double[elementContexts.size()];
                            for (int i = 0; i < elementContexts.size(); i++) {
                                arrayValues[i] = Double.parseDouble(elementContexts.get(i).getText());
                            }
                            value = arrayValues;
                        } catch (NumberFormatException ex) {
                            TreeParsingException parsingException =
                                    new TreeParsingException("Encountered vector-valued metadata entry with " +
                                            "one or more non-numeric elements.");
                            // Keep the underlying number-format failure for diagnostics.
                            parsingException.initCause(ex);
                            throw parsingException;
                        }
                    } else {
                        throw new TreeParsingException("Encountered unknown metadata value.");
                    }

                    if (isLengthMeta) {
                        node.setLengthMetaData(key, value);
                    } else {
                        node.setMetaData(key, value);
                    }
                }
            }
        }

        /**
         * Use zero-length edges to replace multifurcations with a sequence of bifurcations.
         * Nodes with two or fewer children are left untouched.
         *
         * @param node node representing multifurcation
         */
        private void binarizeMultifurcation(Node node) {
            if (node.getChildCount() <= 2) {
                return;
            }

            // Work on a copy: children are removed from the node while iterating.
            List<Node> children = new ArrayList<>(node.getChildren());
            Node prevDummy = node;
            for (int i = 1; i < children.size() - 1; i++) {
                Node child = children.get(i);

                Node dummyNode = newNode();
                dummyNode.setNr(-1);
                dummyNode.setHeight(0);
                prevDummy.addChild(dummyNode);

                node.removeChild(child);
                dummyNode.addChild(child);

                prevDummy = dummyNode;
            }

            // The last child hangs off the innermost dummy node.
            final Node lastChild = children.get(children.size() - 1);
            node.removeChild(lastChild);
            prevDummy.addChild(lastChild);
        }

        /**
         * Builds one node: visits its children, attaches metadata, records the
         * edge length in the height field (converted to a real height later),
         * and resolves the label to an id and, where possible, a node number.
         *
         * @param ctx parse-tree context of the node rule
         * @return the constructed node
         */
        @Override
        public Node visitNode(NewickParser.NodeContext ctx) {
            Node node = newNode();

            for (NewickParser.NodeContext ctxChild : ctx.node()) {
                node.addChild(visit(ctxChild));
            }

            NewickParser.PostContext postCtx = ctx.post();

            // Process metadata
            if (postCtx.nodeMeta != null) {
                processMetadata(node, postCtx.nodeMeta, false);
            }

            if (postCtx.lengthMeta != null) {
                processMetadata(node, postCtx.lengthMeta, true);
            }

            // Process edge length
            if (postCtx.length != null) {
                node.setHeight(Double.parseDouble(postCtx.length.getText()));
            } else {
                node.setHeight(DEFAULT_LENGTH);
            }

            // Process label
            node.setNr(-1);
            if (postCtx.label() != null) {
                final String labelText = postCtx.label().getText();
                final boolean isIntegerLabel = postCtx.label().number() != null
                        && postCtx.label().number().INT() != null;

                node.setID(labelText);

                if (!isIntegerLabel) {
                    integerLeafLabels = false;
                }

                // Treat labels as node numbers in certain situations
                if (!isLabelledNewickInput.get() && isIntegerLabel) {
                    int nodeNr = Integer.parseInt(labelText) - offsetInput.get();
                    if (nodeNr < 0) {
                        throw new TreeParsingException("Node number given " +
                                "is smaller than current offset (" +
                                offsetInput.get() + "). Perhaps offset is " +
                                "too high?");
                    }

                    node.setNr(nodeNr);
                    numberedNodeCount += 1;
                } else if (node.isLeaf()) {
                    node.setNr(getLabelIndex(labelText));
                    numberedNodeCount += 1;
                }
            }

            if (node.getChildCount() == 1 && !allowSingleChildInput.get()) {
                throw new TreeParsingException("Node with single child found.");
            }

            // Use length-zero edges to binarize multifurcations.
            if (binarizeMultifurcationsInput.get()) {
                binarizeMultifurcation(node);
            }

            return node;
        }

        /**
         * Try to map str into an index.
         *
         * @param str leaf label read from the Newick string
         * @return index of the label in {@link #labels}; if the label is unknown and
         *         {@link #createUnrecognizedTaxa} is set, the label is appended and its new index returned
         * @throws TreeParsingException if the label cannot be resolved, including
         *                              when no list of labels is available at all
         */
        private int getLabelIndex(final String str) {
            // A missing label list is reported as a parsing error (see message
            // below) rather than surfacing as a NullPointerException.
            if (labels != null) {
                // look it up in list of taxa
                for (int index = 0; index < labels.size(); index++) {
                    if (str.equals(labels.get(index))) {
                        return index;
                    }
                }

                // if createUnrecognizedTaxon==true, then do it now, otherwise labels will not be populated and
                // out of bounds error will occur in m_sLabels later.
                if (createUnrecognizedTaxa) {
                    labels.add(str);
                    return labels.size() - 1;
                }
            }

            throw new TreeParsingException("Label '" + str + "' in Newick beast.tree could " +
                    "not be identified. Perhaps taxa or taxonset is not specified?");
        }

        /**
         * The node height field is initially populated with the length of the edge above due
         * to the way the tree is stored in Newick format. This method converts these lengths
         * to actual ages before the most recent sample.
         *
         * @param root root of tree
         */
        private void convertLengthToHeight(final Node root) {
            final double totalHeight = convertLengthToHeight(root, 0);
            offset(root, -totalHeight);
        }

        /**
         * Recursive method used to convert lengths to heights. Applied to the root,
         * results in heights from 0 to -total_height_of_tree.
         *
         * @param node   node of a clade to convert
         * @param height Parent height.
         * @return total height of clade
         */
        private double convertLengthToHeight(final Node node, final double height) {
            final double length = node.getHeight();
            node.setHeight((height - length) * scaleInput.get());
            if (node.isLeaf()) {
                return node.getHeight();
            }

            double minChildHeight = Double.POSITIVE_INFINITY;
            for (Node child : node.getChildren()) {
                minChildHeight = Math.min(minChildHeight, convertLengthToHeight(child, height - length));
            }
            return minChildHeight;
        }

        /**
         * Method used by convertLengthToHeight(node) to remove negative offset from
         * node heights that is produced by convertLengthToHeight(node, height).
         * Leaf heights that end up below the configured threshold are set to zero.
         *
         * @param node  node of clade to offset
         * @param delta offset
         */
        private void offset(final Node node, final double delta) {
            node.setHeight(node.getHeight() + delta);
            if (node.isLeaf() && node.getHeight() < thresholdInput.get()) {
                node.setHeight(0);
            }

            for (Node child : node.getChildren()) {
                offset(child, delta);
            }
        }

        /**
         * Number any nodes in a clade which were not explicitly numbered by
         * the parsed string. Leaves are never numbered here.
         *
         * @param node clade parent
         */
        private void numberUnnumberedNodes(Node node) {
            if (node.isLeaf()) {
                return;
            }

            for (Node child : node.getChildren()) {
                numberUnnumberedNodes(child);
            }

            if (node.getNr() < 0) {
                node.setNr(numberedNodeCount);
            }

            // The counter advances for every internal node visited, whether or
            // not that node received its number here.
            numberedNodeCount += 1;
        }
    }

    /*
     * StateNodeInitializer implementation
     */

    /**
     * Copies this parsed tree into the tree given as the initial input, if any.
     */
    @Override
    public void initStateNodes() {
        if (m_initial.get() != null) {
            m_initial.get().assignFromWithoutID(this);
        }
    }

    /**
     * Adds the tree given as the initial input (if any) to the supplied list.
     *
     * @param stateNodes list to append to
     */
    @Override
    public void getInitialisedStateNodes(final List<StateNode> stateNodes) {
        if (m_initial.get() != null) {
            stateNodes.add(m_initial.get());
        }
    }

    /**
     * Unchecked exception raised for syntax errors and for semantic problems
     * (unknown labels, duplicate taxa, bad metadata, ...) found while parsing.
     */
    public class TreeParsingException extends RuntimeException {
        String message;
        Integer characterNum, lineNum;

        /**
         * Create new parsing exception.
         *
         * @param message      Human-readable error message.
         * @param characterNum Character offset of error.
         * @param lineNum      Line offset of error.
         */
        TreeParsingException(String message, Integer characterNum, Integer lineNum) {
            super(message);
            this.message = message;
            this.characterNum = characterNum;
            this.lineNum = lineNum;
        }

        /**
         * Create new parsing exception
         *
         * @param message Human-readable error message.
         */
        TreeParsingException(String message) {
            this(message, null, null);
        }

        @Override
        public String getMessage() {
            return message;
        }

        /**
         * @return location of error on line. (May be null for non-lexer errors.)
         */
        public Integer getCharacterNum() {
            return characterNum;
        }

        /**
         * @return line number offset of error. (May be null for non-lexer errors.)
         */
        public Integer getLineNum() {
            return lineNum;
        }
    }
}