org.scify.NewSumServer.Server.MachineLearning.dataSets.java Source code

Java tutorial

Introduction

Here is the source code for org.scify.NewSumServer.Server.MachineLearning.dataSets.java

Source

/*
 * Copyright 2013 SciFY NPO <info@scify.org>.
 *
 * This product is part of the NewSum Free Software.
 * For more information about NewSum visit
 * 
 *    http://www.scify.gr/site/en/our-projects/completed-projects/newsum-menu-en
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * If this code or its output is used, extended, re-engineered, integrated, 
 * or embedded to any extent in another software or hardware, there MUST be 
 * an explicit attribution to this work in the resulting source code, 
 * the packaging (where such packaging exists), or user interface 
 * (where such an interface exists). 
 * The attribution must be of the form "Powered by NewSum, SciFY"
 */

package org.scify.NewSumServer.Server.MachineLearning;

import org.scify.NewSumServer.Server.MachineLearning.vector;
import gr.demokritos.iit.jinsect.storage.INSECTDB;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import weka.core.Utils;

/**
 *
 * @author panagiotis giotis
 *
 * Includes methods which create the training and labeling (test) datasets
 *
 */
public class dataSets {

    /**
     * Generates the training dataset.
     *
     * The dataset header holds one numeric attribute per class graph name
     * found in the given InsectDB, followed by a nominal "Class" attribute
     * whose values are those same names. Each string returned by
     * {@code vector.trainingVector(file)} is split on commas: every element
     * except the last is parsed as a numeric feature and the last element is
     * looked up among the class names to obtain the nominal class value.
     *
     * The class index of the returned dataset is not set by this method.
     *
     * @param file the InsectDB holding the class graphs (category "cg")
     * @return the training dataset as Instances
     * @throws NumberFormatException if a feature element is not a valid double
     */
    public static Instances trainingSet(INSECTDB file) {
        ArrayList<String> classNames = new ArrayList<String>();
        ArrayList<Attribute> atts = buildAttributes(file, classNames);

        Instances data = new Instances("train Set for Category Classification ", atts, 0);//create Instances object

        ArrayList<String> vectors = vector.trainingVector(file); // take all instance vectors

        for (String vi : vectors) { // for each instance
            String[] vectorTable = vi.trim().split(",");
            double[] vals = new double[data.numAttributes()];

            // The last element is the class name; everything before it is a feature.
            int labelPos = vectorTable.length - 1;

            // Copy ALL feature values. The previous bound (length - 2) skipped the
            // last feature, leaving it at 0.0 in every training instance.
            for (int i = 0; i < labelPos; i++) {
                vals[i] = Double.parseDouble(vectorTable[i]);
            }

            // Nominal values are stored as the index of the label in the value list.
            // NOTE(review): indexOf yields -1 for a label that is not a known class
            // graph name; presumably that never happens - confirm against
            // vector.trainingVector.
            vals[labelPos] = classNames.indexOf(vectorTable[labelPos]);

            data.add(new DenseInstance(1.0, vals)); // add data to Instance
        }

        return data;
    }

    /**
     * Generates the labeling dataset: a single unlabeled instance with the
     * same header layout as the training dataset.
     *
     * Every comma-separated element of {@code Ivector} is parsed as a numeric
     * feature, and a missing value is stored in the slot right after the last
     * parsed feature. This assumes {@code Ivector} holds exactly one value
     * per class graph, so that slot is the "Class" attribute.
     *
     * The class index of the returned dataset is not set by this method.
     *
     * @param file the InsectDB holding the class graphs (category "cg")
     * @param Ivector the similarity vector between the given item and all
     * class graphs, as comma-separated numbers
     * @return the labeling dataset as Instances
     * @throws NumberFormatException if an element is not a valid double
     */
    public static Instances labelingSet(INSECTDB file, String Ivector) {
        ArrayList<String> classNames = new ArrayList<String>();
        ArrayList<Attribute> atts = buildAttributes(file, classNames);

        Instances data = new Instances("label Set for Category Classification ", atts, 0);//create Instances object

        String[] vectorTable = Ivector.trim().split(",");

        double[] vals = new double[data.numAttributes()];
        int count = 0;
        for (String value : vectorTable) { // for each feature value
            vals[count] = Double.parseDouble(value);
            count++;
        }

        vals[count] = Utils.missingValue(); // class is unknown: mark it as missing

        data.add(new DenseInstance(1.0, vals)); // add data to Instance

        return data;
    }

    /**
     * Builds the attribute list shared by the training and labeling datasets:
     * one numeric attribute per distinct class graph name, followed by the
     * nominal "Class" attribute listing those names as its values.
     *
     * @param file the InsectDB whose "cg" object list supplies the names
     * @param classNames output list; receives the names in attribute order
     * @return the attribute list for the dataset header
     */
    private static ArrayList<Attribute> buildAttributes(INSECTDB file, ArrayList<String> classNames) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();

        // A HashSet drops duplicate names; attributes follow its iteration order.
        HashSet<String> graphNames = new HashSet<String>(Arrays.asList(file.getObjectList("cg")));
        for (String name : graphNames) {
            atts.add(new Attribute(name));
            classNames.add(name);
        }

        atts.add(new Attribute("Class", classNames));
        return atts;
    }
}