au.org.ala.delta.key.directives.io.KeyItemsFileReader.java Source code

Java tutorial

Introduction

Here is the source code for au.org.ala.delta.key.directives.io.KeyItemsFileReader.java

Source

/*******************************************************************************
 * Copyright (C) 2011 Atlas of Living Australia
 * All Rights Reserved.
 * 
 * The contents of this file are subject to the Mozilla Public
 * License Version 1.1 (the "License"); you may not use this file
 * except in compliance with the License. You may obtain a copy of
 * the License at http://www.mozilla.org/MPL/
 * 
 * Software distributed under the License is distributed on an "AS
 * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * rights and limitations under the License.
 ******************************************************************************/
package au.org.ala.delta.key.directives.io;

import au.org.ala.delta.DeltaContext.HeadingType;
import au.org.ala.delta.directives.validation.DirectiveException;
import au.org.ala.delta.io.BinaryKeyFile;
import au.org.ala.delta.key.ItemsFileHeader;
import au.org.ala.delta.key.KeyContext;
import au.org.ala.delta.model.Character;
import au.org.ala.delta.model.CharacterDependency;
import au.org.ala.delta.model.Item;
import au.org.ala.delta.model.MultiStateAttribute;
import au.org.ala.delta.model.MultiStateCharacter;
import au.org.ala.delta.model.MutableDeltaDataSet;
import au.org.ala.delta.model.impl.ControllingInfo;
import au.org.ala.delta.util.Utils;
import org.apache.commons.lang.ArrayUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.lang.math.IntRange;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

public class KeyItemsFileReader {

    // Attribute data is stored as 32 bit words
    private static int ATTRIBUTE_DATA_LENGTH = 32 / Byte.SIZE;

    public static final int INAPPLICABLE_BIT = 20;

    private KeyContext _context;
    private ItemsFileHeader _header;
    private MutableDeltaDataSet _dataset;
    private BinaryKeyFile _keyItemsFile;

    public KeyItemsFileReader(KeyContext context, MutableDeltaDataSet dataset, BinaryKeyFile keyItemsFile) {
        _context = context;
        _dataset = dataset;
        _keyItemsFile = keyItemsFile;

        _header = new ItemsFileHeader();
        List<Integer> headerInts = _keyItemsFile.readIntegerList(1, ItemsFileHeader.SIZE);
        _header.fromInts(headerInts);
        _context.setMaximumNumberOfItems(_header.getNumberOfItems());
        _context.setNumberOfCharacters(_header.getNumberOfCharacters());
    }

    public void readAll() {
        readHeading();
        readCharacterReliabilities();
        readCharacterDependencies();
        readItems();
        readItemAbundances();
        readIncludedCharacters();
        readIncludedItems();
    }

    public void readHeading() {
        List<Integer> headingLengthInList = _keyItemsFile.readIntegerList(_header.getHeadingRecord(), 1);
        int headingLength = headingLengthInList.get(0);
        String heading = _keyItemsFile.readString(_header.getHeadingRecord() + 1, headingLength);

        // Only use the heading from the dataset if one has not already been
        // provided in the key directives file.
        if (StringUtils.isEmpty(_context.getHeading(HeadingType.HEADING))) {
            _context.setHeading(HeadingType.HEADING, heading);
        }
    }

    public void readItems() {
        List<Integer> itemNameLengths = _keyItemsFile.readIntegerList(_header.getItemNameLengthsRecord(),
                _header.getNumberOfItems());

        int currentRecord = 2;
        for (int i = 0; i < _header.getNumberOfItems(); i++) {
            _keyItemsFile.seekToRecord(currentRecord);

            Item item = _dataset.addItem();

            int itemNameLength = itemNameLengths.get(i);
            String itemName = _keyItemsFile.readString(currentRecord, itemNameLength);

            item.setDescription(itemName);

            currentRecord += recordsSpannedByBytes(itemNameLength);
            _keyItemsFile.seekToRecord(currentRecord);

            byte[] allAttributesData = _keyItemsFile.read(_header.getNumberOfCharacters() * ATTRIBUTE_DATA_LENGTH);

            processItemAttributes(item, allAttributesData);

            currentRecord += recordsSpannedByBytes(allAttributesData.length);
        }

        _context.setMaximumNumberOfItems(_dataset.getMaximumNumberOfItems());
    }

    public void readItemAbundances() {
        List<Float> itemAbundances = _keyItemsFile.readFloatList(_header.getItemAbundancesRecord(),
                _header.getNumberOfItems());

        for (int i = 0; i < _header.getNumberOfItems(); i++) {
            int itemNumber = i + 1;
            float abundance = itemAbundances.get(i);

            // Item abundancy may have already been set in the directives file
            if (!_context.itemAbundancySet(itemNumber)) {
                _context.addItemAbundancy(itemNumber, abundance);
            }
        }
    }

    private void processItemAttributes(Item item, byte[] allAttributesData) {

        Set<Integer> variableCharacters = _context.getVariableCharactersForTaxon(item.getItemNumber());

        // First pass, read attribute data from file, or set values specified by
        // TREAT CHARACTERS AS VARIABLE directive
        for (int i = 0; i < _header.getNumberOfCharacters(); i++) {
            int charNum = i + 1;
            MultiStateCharacter ch = (MultiStateCharacter) _dataset.getCharacter(charNum);
            MultiStateAttribute msAttr = (MultiStateAttribute) _dataset.addAttribute(item.getItemNumber(),
                    ch.getCharacterId());

            List<String> presentStatesAsStrings = new ArrayList<String>();

            if (variableCharacters != null && variableCharacters.contains(charNum)) {
                // Character is variable so all states are present
                for (int j = 0; j < ch.getNumberOfStates(); j++) {
                    int stateNumber = j + 1;
                    presentStatesAsStrings.add(Integer.toString(stateNumber));
                }
            } else {
                int arrayOffset = i * ATTRIBUTE_DATA_LENGTH;
                byte[] attributeDataAsBytes = ArrayUtils.subarray(allAttributesData, arrayOffset,
                        arrayOffset + ATTRIBUTE_DATA_LENGTH);

                boolean[] attributeDataAsBooleans = Utils.byteArrayToBooleanArray(attributeDataAsBytes);

                // Pull out which states are present from the attribute data.
                // Attribute data is stored as 32 bit words, so 32 is the
                // maximum number
                // of available states, regardless of how many states the
                // character actually has.
                for (int j = 0; j < Math.min(ch.getNumberOfStates(), 32); j++) {
                    int stateNumber = j + 1;
                    if (attributeDataAsBooleans[j]) {
                        presentStatesAsStrings.add(Integer.toString(stateNumber));
                    }
                }

                boolean inapplicable = attributeDataAsBooleans[INAPPLICABLE_BIT];

                if (inapplicable) {
                    presentStatesAsStrings.add("-");
                }
            }

            // TODO bit of a hack here, as DefaultAttributeData can
            // currently
            // only be set using a String.
            // Will be able to fix this up when we switch to using a
            // SlotFile
            // based dataset.
            try {
                msAttr.setValueFromString(StringUtils.join(presentStatesAsStrings, "/"));
            } catch (DirectiveException e) {
                throw new RuntimeException(e);
            }
        }

        // Second pass, set inapplicable any dependent characters that are made
        // inapplicable by the values of their
        // controlling characters
        // First pass, read attribute data from file
        for (int i = 0; i < _header.getNumberOfCharacters(); i++) {
            MultiStateCharacter ch = (MultiStateCharacter) _dataset.getCharacter(i + 1);

            MultiStateAttribute msAttr = (MultiStateAttribute) _dataset.getAttribute(item.getItemNumber(),
                    ch.getCharacterId());

            if (!msAttr.isInapplicable()) {
                ControllingInfo controllingInfo = _dataset.checkApplicability(ch, item);
                // if (controllingInfo.isStrictlyInapplicable() ||
                // (controllingInfo.isMaybeInapplicable() &&
                // !msAttr.isUnknown())) {
                if (controllingInfo.isInapplicable()) {

                    // Maintain existing state information - any attribute with
                    // state information already set becomes
                    // "maybe inapplicable"
                    List<Integer> presentStates = msAttr.getPresentStatesAsList();
                    List<String> presentStatesAsStrings = new ArrayList<String>();
                    for (int stateNum : presentStates) {
                        presentStatesAsStrings.add(Integer.toString(stateNum));
                    }

                    // add inapplicable value
                    presentStatesAsStrings.add("-");

                    // TODO bit of a hack here, as DefaultAttributeData can
                    // currently
                    // only be set using a String.
                    // Will be able to fix this up when we switch to using a
                    // SlotFile
                    // based dataset.
                    try {
                        msAttr.setValueFromString(StringUtils.join(presentStatesAsStrings, "/"));
                    } catch (DirectiveException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
        }

        // Third pass, unknown values should be variable unless the
        // TREAT UNKNOWN AS INAPPLICABLE directive has been used
        if (!_context.getTreatUnknownAsInapplicable()) {
            for (int i = 0; i < _header.getNumberOfCharacters(); i++) {
                MultiStateCharacter ch = (MultiStateCharacter) _dataset.getCharacter(i + 1);

                MultiStateAttribute msAttr = (MultiStateAttribute) _dataset.getAttribute(item.getItemNumber(),
                        ch.getCharacterId());

                if (msAttr.getPresentStates().isEmpty() && !msAttr.isInapplicable()) {
                    // Set attribute as variable - all states are present
                    List<String> presentStatesAsStrings = new ArrayList<String>();
                    for (int j = 0; j < ch.getNumberOfStates(); j++) {
                        int stateNumber = j + 1;
                        presentStatesAsStrings.add(Integer.toString(stateNumber));
                    }

                    // TODO bit of a hack here, as DefaultAttributeData can
                    // currently
                    // only be set using a String.
                    // Will be able to fix this up when we switch to using a
                    // SlotFile
                    // based dataset.
                    try {
                        msAttr.setValueFromString(StringUtils.join(presentStatesAsStrings, "/"));
                    } catch (DirectiveException e) {
                        throw new RuntimeException(e);
                    }
                }
            }
        }
    }

    public void readCharacterDependencies() {
        if (_header.getCharacterDependenciesLength() > _header.getNumberOfCharacters()) {
            List<Integer> characterDependencyData = _keyItemsFile.readIntegerList(
                    _header.getCharacterDependencyRecord(), _header.getCharacterDependenciesLength());

            List<Integer> dependencyInfoPointers = characterDependencyData.subList(0,
                    _header.getNumberOfCharacters());

            // At the start of the dependency data there is an integer value for
            // each character.
            // If non zero, the value is an offset further down the list where
            // its dependency data is.
            // Otherwise the character does not have any dependent characters.
            for (int i = 0; i < _header.getNumberOfCharacters(); i++) {
                int charDepIndex = dependencyInfoPointers.get(i);
                if (charDepIndex > 0) {

                    MultiStateCharacter controllingChar = (MultiStateCharacter) _dataset.getCharacter(i + 1);

                    int numStates = controllingChar.getStates().length;

                    // The dependency data for each character consists of one
                    // integer for each of the character's states. If the
                    // integer
                    // value listed for a state is non-zero, the value is an
                    // offset pointing to further down the list where
                    // the state's dependency data is.
                    int stateDepIndiciesStart = charDepIndex - 1;
                    int stateDepIndiciesEnd = charDepIndex - 1 + numStates;
                    List<Integer> stateDepRecordIndicies = characterDependencyData.subList(stateDepIndiciesStart,
                            stateDepIndiciesEnd);

                    // We need to coalesce the dependency data so that we have
                    // one CharacterDependency object per
                    // controlling character and set of states that make a set
                    // of dependent characters inapplicable.
                    // Use this map to keep track of the state ids that make the
                    // same set of dependent characters
                    // inapplicable.
                    Map<Set<Integer>, Set<Integer>> depCharsToStateIds = new HashMap<Set<Integer>, Set<Integer>>();

                    for (int j = 0; j < numStates; j++) {
                        Integer stateId = j + 1;

                        int stateDepRecordIndex = stateDepRecordIndicies.get(j);

                        if (stateDepRecordIndex > 0) {
                            // First value listed in the state's dependency data
                            // is the number of character ranges dependent on
                            // that state.
                            int numDependentCharRanges = characterDependencyData.get(stateDepRecordIndex - 1);

                            // Immediately after the range information is listed
                            // - the upper and lower bound is listed for each
                            // range.
                            List<Integer> rangeNumbers = characterDependencyData.subList(stateDepRecordIndex,
                                    stateDepRecordIndex + (numDependentCharRanges * 2));

                            Set<Integer> dependentChars = new HashSet<Integer>();

                            for (int k = 0; k < numDependentCharRanges * 2; k = k + 2) {
                                int lowerBound = rangeNumbers.get(k);
                                int upperBound = rangeNumbers.get(k + 1);

                                IntRange r = new IntRange(lowerBound, upperBound);

                                for (int dependentChar : r.toArray()) {
                                    dependentChars.add(dependentChar);
                                }
                            }

                            if (depCharsToStateIds.containsKey(dependentChars)) {
                                Set<Integer> stateSet = depCharsToStateIds.get(dependentChars);
                                stateSet.add(stateId);
                            } else {
                                Set<Integer> stateSet = new HashSet<Integer>();
                                stateSet.add(stateId);
                                depCharsToStateIds.put(dependentChars, stateSet);
                            }
                        }
                    }

                    // Now that we have coalesced the dependency data into the
                    // form we need, we can
                    // create the CharacterDependency objects.
                    for (Set<Integer> depCharsSet : depCharsToStateIds.keySet()) {
                        Set<Integer> stateSet = depCharsToStateIds.get(depCharsSet);
                        CharacterDependency charDep = _dataset.addCharacterDependency(controllingChar, stateSet,
                                depCharsSet);
                    }
                }
            }
        }
    }

    public void readCharacterReliabilities() {
        List<Float> reliabilities = _keyItemsFile.readFloatList(_header.getCharcterReliabilitiesRecord(),
                _header.getNumberOfCharacters());
        int numberOfCharacters = _header.getNumberOfCharacters();

        for (int i = 0; i < numberOfCharacters; i++) {
            Character ch = _dataset.getCharacter(i + 1);
            ch.setReliability(reliabilities.get(i));
        }
    }

    public void readIncludedCharacters() {
        List<Integer> characterMask = _keyItemsFile.readIntegerList(_header.getCharacterMaskRecord(),
                _header.getNumberOfCharacters());

        List<Integer> includedCharacterNumbers = new ArrayList<Integer>();
        for (int i = 0; i < _header.getNumberOfCharacters(); i++) {
            int characterNumber = i + 1;
            if (characterMask.get(i) == 0) {
                _context.excludeCharacter(characterNumber);
            }
        }
    }

    public void readIncludedItems() {
        List<Integer> taxonMask = _keyItemsFile.readIntegerList(_header.getTaxonMaskRecord(),
                _header.getNumberOfItems());

        List<Integer> includedItemNumbers = new ArrayList<Integer>();
        for (int i = 0; i < _header.getNumberOfItems(); i++) {
            int itemNumber = i + 1;
            if (taxonMask.get(i) == 0) {
                _context.excludeItem(itemNumber);
                _context.addItemAbundancy(itemNumber, 0);
            }
        }
    }

    private int recordsSpannedByBytes(int numBytes) {
        return (int) (Math.ceil((double) numBytes / (double) BinaryKeyFile.RECORD_LENGTH_BYTES));
    }

}