joelib2.algo.datamining.yale.ExampleSetHelper.java Source code

Java tutorial

Introduction

Here is the source code for joelib2.algo.datamining.yale.ExampleSetHelper.java

Source

///////////////////////////////////////////////////////////////////////////////
//  Filename: $RCSfile: ExampleSetHelper.java,v $
//  Purpose:  Helper for building Weka instance sets from molecules and matrices.
//  Language: Java
//  Compiler: JDK 1.4
//  Authors:  Joerg Kurt Wegner
//  Version:  $Revision: 1.8 $
//            $Date: 2005/02/17 16:48:28 $
//            $Author: wegner $
//
// Copyright OELIB:          OpenEye Scientific Software, Santa Fe,
//                           U.S.A., 1999,2000,2001
// Copyright JOELIB/JOELib2: Dept. Computer Architecture, University of
//                           Tuebingen, Germany, 2001,2002,2003,2004,2005
// Copyright JOELIB/JOELib2: ALTANA PHARMA AG, Konstanz, Germany,
//                           2003,2004,2005
//
//  This program is free software; you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation version 2 of the License.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
///////////////////////////////////////////////////////////////////////////////
package joelib2.algo.datamining.yale;

import joelib2.feature.NativeValue;

import joelib2.molecule.Molecule;
import joelib2.molecule.MoleculeVector;

import joelib2.molecule.types.BasicPairData;

import joelib2.process.types.DescriptorBinning;

//import joelib2.util.JOEHelper;

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.WekaException;

import java.util.Enumeration;
import java.util.Hashtable;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

/**
 * Helper for converting molecule descriptor data and plain value matrices
 * into Weka {@link Instances}.
 *
 * @.author    wegnerj
 */
public class ExampleSetHelper {
    //~ Static fields/initializers /////////////////////////////////////////////

    /**
     * Obtain a suitable logger.
     */
    private static final Logger logger = LogManager.getLogger("jcompchem.joelib2.algo.yale.ExampleSetHelper");

    //~ Constructors ///////////////////////////////////////////////////////////

    /**
     * Creates a helper object. The conversion methods of this class are
     * static, so the constructor only emits a debug log entry.
     */
    public ExampleSetHelper() {
        if (logger.isDebugEnabled()) {
            logger.debug("Initialize " + this.getClass().getName());
        }
    }

    //~ Methods ////////////////////////////////////////////////////////////////

    /**
     * Builds a Weka instance set with one instance per molecule and one
     * attribute per requested descriptor.
     *
     * <p>Only entries typed {@link Attribute#NUMERIC} or
     * {@link Attribute#NOMINAL} become attributes; entries with any other type
     * are skipped. Descriptor data that is absent on a molecule, or numeric
     * data that is NaN, is stored as a missing value.</p>
     *
     * @param molecules       molecules providing the descriptor data
     * @param attributes      descriptor names, one per attribute
     * @param attributeTypes  Weka attribute type for each entry of
     *                        <tt>attributes</tt>
     * @return the created instances, or <tt>null</tt> if a nominal value of a
     *         molecule is not among the values collected for its attribute
     *         (an error is logged in that case)
     * @throws WekaException if the two arrays differ in length, a nominal
     *         descriptor value spans multiple lines, or a numeric descriptor
     *         is not a {@link NativeValue}
     */
    public static Instances createMolInstances(MoleculeVector molecules, String[] attributes, int[] attributeTypes)
            throws WekaException {
        // load descriptor binning
        // NOTE(review): the binning is only used to pre-size the attribute
        // list; whether getDescBinning has side effects that callers rely on
        // is not visible from this file, so the call is kept.
        DescriptorBinning binning = DescriptorBinning.getDescBinning(molecules);

        int length = molecules.getSize();

        if (attributes.length != attributeTypes.length) {
            throw new WekaException("Different number of attributes and attribute types.");
        }

        FastVector attributesV = new FastVector(binning.numberOfDescriptors());

        for (int i = 0; i < attributes.length; i++) {
            if (attributeTypes[i] == Attribute.NUMERIC) {
                // numeric: name the attribute after the requested descriptor,
                // consistent with the nominal branch and with the later data
                // lookup by attribute name
                attributesV.addElement(new Attribute(attributes[i], attributesV.size()));
            } else if (attributeTypes[i] == Attribute.NOMINAL) {
                // nominal: create a list with all nominal values
                Hashtable<String, String> hashed = new Hashtable<String, String>();

                for (int j = 0; j < length; j++) {
                    // get unparsed data
                    BasicPairData unparsed = (BasicPairData) molecules.getMol(j).getData(attributes[i], false);

                    if (unparsed != null) {
                        Object keyValue = unparsed.getKeyValue();
                        String nominal = (keyValue instanceof String) ? (String) keyValue : unparsed.toString();
                        hashed.put(nominal, "");
                    }
                }

                // store list of nominal values in the Weka data structure
                attributesV.addElement(new Attribute(attributes[i], keysAsNominalValues(hashed),
                        attributesV.size()));
            }
        }

        int size = attributesV.size();

        // create molecule instances; the capacity is the expected number of
        // instances (one per molecule)
        Instances instances = new Instances("MoleculeInstances", attributesV, length);

        // iterate over all instances (to generate them)
        for (int i = 0; i < length; i++) {
            Molecule mol = molecules.getMol(i);
            double[] instance = new double[size];

            for (int j = 0; j < size; j++) {
                Attribute attribute = (Attribute) attributesV.elementAt(j);

                // get parsed data
                BasicPairData pairData = (BasicPairData) mol.getData(attribute.name(), true);

                // add nominal or numeric or missing value
                if (pairData == null) {
                    instance[attribute.index()] = Instance.missingValue();
                } else if (attribute.isNominal()) {
                    // nominal
                    String rawValue = pairData.toString();

                    if (rawValue.trim().indexOf("\n") != -1) {
                        throw new WekaException("Descriptor " + attribute.name()
                                + " contains multiple lines and is not a valid nominal value.");
                    }

                    // NOTE(review): the nominal values were collected from the
                    // unparsed data, while this lookup uses the parsed data's
                    // toString(); confirm both yield the same text.
                    int valueIndex = attribute.indexOfValue(rawValue);

                    if (valueIndex == -1) {
                        // invalid nominal value
                        logger.error("Invalid nominal value.");

                        return null;
                    }

                    instance[attribute.index()] = valueIndex;
                } else if (pairData instanceof NativeValue) {
                    // numeric
                    double tmpD = ((NativeValue) pairData).getDoubleNV();

                    if (Double.isNaN(tmpD)) {
                        instance[attribute.index()] = Instance.missingValue();
                    } else {
                        instance[attribute.index()] = tmpD;
                    }
                } else {
                    throw new WekaException("Descriptor " + attribute.name() + " is not a native value.");
                }
            }

            // add created molecule instance to molecule instances
            instances.add(new Instance(1, instance));
        }

        return instances;
    }

    /**
     * Builds a Weka instance set from a descriptor-major matrix:
     * <tt>matrix[d][m]</tt> is read as the value of descriptor <tt>d</tt> for
     * instance <tt>m</tt>, and the number of instances is taken from the
     * length of the first row.
     *
     * <p>Only descriptors typed {@link Attribute#NUMERIC} or
     * {@link Attribute#NOMINAL} become attributes; descriptors with any other
     * type are skipped. NaN entries are stored as missing values. An empty
     * matrix yields an instance set without instances.</p>
     *
     * @param matrix          values indexed as <tt>[descriptor][instance]</tt>
     * @param descriptors     descriptor names, one per matrix row
     * @param attributeTypes  Weka attribute type for each descriptor
     * @return the created instances, or <tt>null</tt> if a nominal value is
     *         not among the values collected for its attribute (an error is
     *         logged in that case)
     */
    public static Instances matrix2instances(double[][] matrix, String[] descriptors, int[] attributeTypes) {
        FastVector attributesV = new FastVector(descriptors.length);
        int molecules = (matrix.length == 0) ? 0 : matrix[0].length;

        // sourceRows[k] is the row of matrix/attributeTypes that the k-th
        // created attribute was built from; the two indices differ as soon as
        // a descriptor with an unsupported type is skipped
        int[] sourceRows = new int[descriptors.length];

        for (int i = 0; i < descriptors.length; i++) {
            if (attributeTypes[i] == Attribute.NUMERIC) {
                // numeric
                sourceRows[attributesV.size()] = i;
                attributesV.addElement(new Attribute(descriptors[i], attributesV.size()));
            } else if (attributeTypes[i] == Attribute.NOMINAL) {
                // nominal: create a list with all nominal values
                // NOTE(review): NaN entries are registered as a nominal value
                // here although they are stored as missing values below.
                Hashtable<Double, String> hashed = new Hashtable<Double, String>();

                for (int j = 0; j < molecules; j++) {
                    hashed.put(Double.valueOf(matrix[i][j]), "");
                }

                // store list of nominal values in the Weka data structure
                sourceRows[attributesV.size()] = i;
                attributesV.addElement(new Attribute(descriptors[i], keysAsNominalValues(hashed),
                        attributesV.size()));
            }
        }

        int descriptorSize = attributesV.size();

        // create molecule instances; the capacity is the expected number of
        // instances
        Instances instances = new Instances("MatrixInstances", attributesV, molecules);

        // iterate over all instances (to generate them)
        for (int i = 0; i < molecules; i++) {
            double[] instance = new double[descriptorSize];

            for (int j = 0; j < descriptorSize; j++) {
                Attribute attribute = (Attribute) attributesV.elementAt(j);
                int row = sourceRows[j];
                double value = matrix[row][i];

                if (Double.isNaN(value)) {
                    instance[attribute.index()] = Instance.missingValue();
                } else if (attributeTypes[row] == Attribute.NUMERIC) {
                    // numeric
                    instance[attribute.index()] = value;
                } else {
                    // nominal (only numeric and nominal rows create attributes)
                    int valueIndex = attribute.indexOfValue(Double.toString(value));

                    if (valueIndex == -1) {
                        // invalid nominal value
                        logger.error("Invalid nominal value.");

                        return null;
                    }

                    instance[attribute.index()] = valueIndex;
                }
            }

            // add created molecule instance to molecule instances
            instances.add(new Instance(1, instance));
        }

        return instances;
    }

    /**
     * Copies the keys of the given table, converted with
     * <tt>toString()</tt>, into a Weka vector of nominal values in the
     * table's key enumeration order.
     */
    private static FastVector keysAsNominalValues(Hashtable<?, String> distinctValues) {
        FastVector nominalValues = new FastVector(distinctValues.size());

        for (Enumeration<?> e = distinctValues.keys(); e.hasMoreElements();) {
            nominalValues.addElement(e.nextElement().toString());
        }

        return nominalValues;
    }
}

///////////////////////////////////////////////////////////////////////////////
//  END OF FILE.
///////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
//  END OF FILE.
///////////////////////////////////////////////////////////////////////////////