org.barcelonamedia.uima.CAS2WekaInstance.java Source code

Java tutorial

Introduction

Here is the source code for org.barcelonamedia.uima.CAS2WekaInstance.java

Source

/*
 * Copyright: (c) 2004-2006 Mayo Foundation for Medical Education and
 * Research (MFMER).  All rights reserved.  MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, the trade names, 
 * trademarks, service marks, or product names of the copyright holder shall
 * not be used in advertising, promotion or otherwise in connection with
 * this Software without prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 *       http://www.eclipse.org/legal/epl-v10.html
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.barcelonamedia.uima;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.List;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.SparseInstance;

import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.barcelonamedia.uima.types.AttributeValue;
import org.barcelonamedia.uima.types.DateAttributeValue;
import org.barcelonamedia.uima.types.NominalAttributeValue;
import org.barcelonamedia.uima.types.NumericAttributeValue;
import org.barcelonamedia.uima.types.StringAttributeValue;

/** 
 * @author Philip Ogren
 */

public class CAS2WekaInstance {
    /**
     * Creates a Weka data instance from all AttributeValue annotations found in the passed CAS.
     * AttributeValue annotations whose attribute name is not defined in the passed Weka
     * Instances object are ignored.
     *
     * @param cas the CAS whose AttributeValue annotations are converted
     * @param wekaInstances this should be instantiated using an ARFF Header file generated by ARFFHeaderFileCasConsumer
     * @return a Weka instance populated with features that corresponds to the cas's AttributeValue annotations.
     * @throws CASException if the JCas view cannot be obtained from the CAS
     * @throws IllegalArgumentException if a date value cannot be parsed with the format of its date attribute
     * @see edu.mayo.bmi.uima.weka.cc.ARFFHeaderFileCasConsumer
     */
    public static DenseInstance toWekaInstance(CAS cas, Instances wekaInstances) throws CASException {
        // An int span of [MIN_VALUE, MAX_VALUE] can never exclude an annotation, so this collects all of them.
        List<AttributeValue> attributeValues = collectAttributeValues(cas, Integer.MIN_VALUE, Integer.MAX_VALUE);
        return toWekaInternalInstance(attributeValues, wekaInstances);
    }

    /**
     * Creates a Weka data instance from the AttributeValue annotations of the passed CAS that lie
     * completely inside the given span (annotation begin &gt;= {@code begin} and annotation
     * end &lt;= {@code end}). AttributeValue annotations whose attribute name is not defined in
     * the passed Weka Instances object are ignored.
     *
     * @param cas the CAS whose AttributeValue annotations are converted
     * @param wekaInstances this should be instantiated using an ARFF Header file generated by ARFFHeaderFileCasConsumer
     * @param begin Span begin (inclusive)
     * @param end   Span end (inclusive upper bound for the annotation end offset)
     * @return a Weka instance populated with features that corresponds to the cas's AttributeValue annotations.
     * @throws CASException if the JCas view cannot be obtained from the CAS
     * @throws IllegalArgumentException if a date value cannot be parsed with the format of its date attribute
     * @see edu.mayo.bmi.uima.weka.cc.ARFFHeaderFileCasConsumer
     */
    public static DenseInstance toWekaInstance(CAS cas, Instances wekaInstances, int begin, int end)
            throws CASException {
        return toWekaInternalInstance(collectAttributeValues(cas, begin, end), wekaInstances);
    }

    /**
     * Gathers, in index order, the AttributeValue annotations of the CAS whose offsets satisfy
     * annotation begin &gt;= {@code begin} and annotation end &lt;= {@code end}.
     */
    private static List<AttributeValue> collectAttributeValues(CAS cas, int begin, int end)
            throws CASException {
        JCas jcas = cas.getJCas();
        JFSIndexRepository indexes = jcas.getJFSIndexRepository();
        FSIndex<Annotation> fsIndex = indexes.getAnnotationIndex(AttributeValue.type);
        FSIterator<Annotation> attributeIterator = fsIndex.iterator();

        List<AttributeValue> attributeValues = new ArrayList<AttributeValue>();
        while (attributeIterator.hasNext()) {
            AttributeValue attributeValue = (AttributeValue) attributeIterator.next();
            if (attributeValue.getBegin() >= begin && attributeValue.getEnd() <= end) {
                attributeValues.add(attributeValue);
            }
        }
        return attributeValues;
    }

    /**
     * Builds the Weka instance for the given annotations. Every attribute starts at 0.0 and is
     * overwritten by the matching annotation, if any; when several annotations carry the same
     * attribute name the last one wins.
     */
    private static DenseInstance toWekaInternalInstance(List<AttributeValue> attributeValues,
            Instances wekaInstances) {
        // A freshly allocated double[] is already zero-filled, so no explicit fill is needed.
        DenseInstance wekaInstance = new DenseInstance(1.0d, new double[wekaInstances.numAttributes()]);
        wekaInstance.setDataset(wekaInstances);

        for (AttributeValue attributeValue : attributeValues) {
            Attribute attribute = wekaInstances.attribute(attributeValue.getAttributeName());
            if (attribute == null) {
                // The header does not declare this attribute: nothing to populate.
                continue;
            }

            if (attributeValue instanceof NumericAttributeValue) {
                String value = ((NumericAttributeValue) attributeValue).getValue();
                wekaInstance.setValue(attribute, Double.parseDouble(value));
            } else if (attributeValue instanceof DateAttributeValue) {
                setDateValue(wekaInstance, attribute, ((DateAttributeValue) attributeValue).getValue());
            } else if (attributeValue instanceof NominalAttributeValue) {
                String value = ((NominalAttributeValue) attributeValue).getValue();
                int valueIndex = attribute.indexOfValue(value);
                if (valueIndex < 0) {
                    // Label not declared in the header: storing -1 would corrupt the instance,
                    // so flag the value as missing instead.
                    wekaInstance.setMissing(attribute);
                } else {
                    wekaInstance.setValue(attribute, (double) valueIndex);
                }
            } else if (attributeValue instanceof StringAttributeValue) {
                String value = ((StringAttributeValue) attributeValue).getValue();
                wekaInstance.setValue(attribute, value);
            }
        }

        // Numeric attributes must not stay missing (e.g. after parsing "NaN"); reset them to 0.
        Enumeration<?> attributes = wekaInstances.enumerateAttributes();
        while (attributes.hasMoreElements()) {
            Attribute attribute = (Attribute) attributes.nextElement();
            if (attribute.isNumeric() && wekaInstance.isMissing(attribute)) {
                wekaInstance.setValue(attribute, 0);
            }
        }

        return wekaInstance;
    }

    /**
     * Stores a date value. Weka keeps dates as doubles, so for a date attribute the string is
     * parsed with the attribute's own date format; for any other attribute type the string
     * is handed to Weka unchanged.
     */
    private static void setDateValue(DenseInstance wekaInstance, Attribute attribute, String value) {
        if (!attribute.isDate()) {
            wekaInstance.setValue(attribute, value);
            return;
        }
        try {
            wekaInstance.setValue(attribute, attribute.parseDate(value));
        } catch (java.text.ParseException e) {
            throw new IllegalArgumentException(
                    "Cannot parse date value '" + value + "' for attribute '" + attribute.name() + "'", e);
        }
    }

}