Example usage for weka.core Instances attribute

List of usage examples for weka.core Instances attribute

Introduction

In this page you can find the example usage for weka.core Instances attribute.

Prototype

public Attribute attribute(String name)

Source Link

Document

Returns an attribute given its name.

Usage

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Returns the left side of the condition satisfied by instances, which
 * is the name of the attribute at this split's attribute index.
 *
 * @param data the data whose header supplies the attribute
 * @return the name of the tested attribute
 */
public String leftSide(Instances data) {
    final String attributeName = data.attribute(m_attIndex).name();
    return attributeName;
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Returns the right side of the condition satisfied by instances in the
 * given subset.
 *
 * @param index the subset index; 0 selects the "=" / "<=" form, any other
 *              value selects the "!=" / ">" form
 * @param data the data whose header supplies the attribute
 * @return the operator followed by the nominal value or the split point
 */
public final String rightSide(int index, Instances data) {
    final boolean firstSubset = (index == 0);

    // Nominal attribute: compare against the value selected by the split point.
    if (data.attribute(m_attIndex).isNominal()) {
        final String operator = firstSubset ? " = " : " != ";
        return operator + data.attribute(m_attIndex).value((int) m_splitPoint);
    }

    // Otherwise: compare against the split point itself, printed with 6 decimals.
    final String comparison = firstSubset ? " <= " : " > ";
    return comparison + Utils.doubleToString(m_splitPoint, 6);
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Returns a string containing java source code equivalent to the test
 * made at this node. The instance being tested is called "i".
 *
 * @param index index of the nominal value tested
 * @param data the data containing instance structure info
 * @return a value of type 'String'/*from w  w w.  j  av a  2  s  .  c  om*/
 */
public final String sourceExpression(int index, Instances data) {

    StringBuffer expr = null;
    if (index < 0) {
        return "i[" + m_attIndex + "] == null";
    }
    if (data.attribute(m_attIndex).isNominal()) {
        if (index == 0)
            expr = new StringBuffer("i[");
        else
            expr = new StringBuffer("!i[");
        expr.append(m_attIndex).append("]");
        expr.append(".equals(\"").append(data.attribute(m_attIndex).value((int) m_splitPoint)).append("\")");
    } else {
        expr = new StringBuffer("((Double) i[");
        expr.append(m_attIndex).append("])");
        if (index == 0) {
            expr.append(".doubleValue() <= ").append(m_splitPoint);
        } else {
            expr.append(".doubleValue() > ").append(m_splitPoint);
        }
    }
    return expr.toString();
}

From source file:j48.GraftSplit.java

License:Open Source License

/**
 * Returns information about this GraftSplit.
 *
 * @param data instances for determining names of attributes and values
 * @return a string showing the tested attribute, the test, the laplace
 *         value in parentheses and the class value at index m_maxClass
 */
public String toString(Instances data) {

    // Test types 0, 1 and 2 map to "<=", ">" and "="; anything else is "!=".
    final String operator = (m_testType == 0) ? " <= "
            : (m_testType == 1) ? " > "
            : (m_testType == 2) ? " = "
            : " != ";

    // Nominal attributes show the value's label, others the raw split point.
    final String operand = data.attribute(m_attIndex).isNominal()
            ? data.attribute(m_attIndex).value((int) m_splitPoint)
            : Double.toString(m_splitPoint);
    final String theTest = operator + operand;

    final String attributeName = data.attribute(m_attIndex).name();
    final String laplace = " (" + Double.toString(m_laplace) + ") --> ";
    return attributeName + theTest + laplace + data.attribute(data.classIndex()).value(m_maxClass);
}

From source file:j48.NBTreeSplit.java

License:Open Source License

/**
 * Creates a NBTree-type split on the given data. Assumes that none of
 * the class values is missing.
 *
 * @param trainInstances the training data; it is sorted in place on the
 *                       split attribute when that attribute is not nominal
 * @throws Exception if something goes wrong
 */
public void buildClassifier(Instances trainInstances) throws Exception {

    // Reset the per-split state; the error count starts from the global
    // model's errors when such a model is set.
    m_numSubsets = 0;
    m_splitPoint = Double.MAX_VALUE;
    m_errors = 0;
    if (null != m_globalNB) {
        m_errors = m_globalNB.getErrors();
    }

    // Non-nominal attribute: complexity index 2, data sorted on the attribute.
    if (!trainInstances.attribute(m_attIndex).isNominal()) {
        m_complexityIndex = 2;
        trainInstances.sort(trainInstances.attribute(m_attIndex));
        handleNumericAttribute(trainInstances);
        return;
    }

    // Nominal attribute: complexity index is the number of attribute values.
    m_complexityIndex = trainInstances.attribute(m_attIndex).numValues();
    handleEnumeratedAttribute(trainInstances);
}

From source file:jjj.asap.sas.datasets.job.Import.java

License:Open Source License

/**
 * Converts a tab-separated input file into a dataset and saves it,
 * unless the output already exists.
 *
 * @param k      the essay set number, used to pick the rubric values and
 *               to decide whether a "color" column is present
 * @param input  the tab-separated file to read
 * @param output where the resulting dataset is saved
 */
private void buildDataset(int k, String input, String output) {

    // Skip the work when the output was produced earlier.
    if (IOUtils.exists(output)) {
        Job.log("NOTE", output + " already exists - nothing to do.");
        return;
    }

    // Declare the columns: id, optional color, text and score.
    final DatasetBuilder schema = new DatasetBuilder();
    schema.addVariable("id");
    if (Contest.isMultiChoice(k)) {
        schema.addNominalVariable("color", Contest.COLORS);
    }
    schema.addStringVariable("text");
    schema.addNominalVariable("score", Contest.getRubrics(k));
    Instances dataset = schema.getDataset(IOUtils.getName(output));

    // Append one observation per input line.
    for (Iterator<String> lines = new FileIterator(input); lines.hasNext();) {
        // Fields 0, 2, 4 and 5 hold id, score, color and text.
        String[] fields = StringUtils.safeSplit(lines.next(), "\t", 6);
        double id = Double.parseDouble(fields[0]);
        String score = fields[2];
        String color = fields[4];
        String text = fields[5];

        dataset.add(new DenseInstance(dataset.numAttributes()));
        Instance row = dataset.lastInstance();
        row.setValue(dataset.attribute("id"), id);
        if (Contest.isMultiChoice(k)) {
            row.setValue(dataset.attribute("color"), color);
        }
        row.setValue(dataset.attribute("text"), text);

        // A score of "?" is stored as a missing value.
        if (!"?".equals(score)) {
            row.setValue(dataset.attribute("score"), score);
        } else {
            row.setValue(dataset.attribute("score"), Utils.missingValue());
        }
    }

    Dataset.save(output, dataset);
}

From source file:jjj.asap.sas.parser.job.ImportParserData.java

License:Open Source License

/**
 * Builds the derived datasets for one essay set and saves them under
 * work/datasets/&lt;parent&gt;/: extra statistics, POS tags, parse tree and
 * seven "depends" variants. The spell-checked dataset of the same essay
 * set is loaded and used as the template for all outputs except the
 * extra statistics. Nothing is done when all ten output files exist.
 *
 * @param parent     name of the directory below work/datasets/
 * @param essaySet   essay set number; used as file-name prefix and passed
 *                   to Contest to pick rubrics / multi-choice handling
 * @param tags       strings per instance id, concatenated into the POS-tags text
 * @param parseTrees strings per instance id, concatenated into the parse-tree text
 * @param depends    strings per instance id; each item is split on "/" into
 *                   three terms to build the depends variants
 */
private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // check if output exists
    // 'any' becomes true as soon as at least one output file is missing.
    boolean any = false;

    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;

    // Every output is already present: log and leave.
    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // create the output datasets here. except for the extra statistics, 
    // the format is the same as 'dataset'.
    // The relation name doubles as the output file name when saving below.

    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");

    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // extra stats
    // Columns: id, optional color, five numeric statistics, score.
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));

    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // now add rows for each instance

    for (int i = 0; i < dataset.numInstances(); i++) {

        // common variables
        // id is the first attribute; y is the last attribute's string value,
        // or null when that value is missing.
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //

        // x_sent: number of tag entries for this id (0 when the id has none).
        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        // x_para: number of '^' characters in the text
        // (presumably a paragraph marker - TODO confirm).
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        // Words are the lower-cased text split on single spaces.
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }

        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);

        // The score is the last attribute of the extra-stats dataset.
        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //

        // Concatenate all tag strings; "x" is the placeholder when there are none.
        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }

        // NOTE(review): the positional indices below assume the template layout
        // is id, [color], text, score - confirm against the spell-checked dataset.
        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse Tree
        //

        // Same scheme as the POS tags, using the parse-tree strings.
        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }

        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //

        // Each of the seven variants keeps a different subset of the three
        // "/"-separated terms of every item:
        //   0: whole item   1: term1/term2   2: term0/term2   3: term0/term1
        //   4: term0        5: term1         6: term2
        for (int j = 0; j < 7; j++) {

            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    // Items are space-separated; the trailing space is trimmed below.
                    text += " ";
                }
            }

            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }

        } // j
    } // dataset

    // Now save the new datasets
    // (all ten are written, including any that already existed)

    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);

}

From source file:jmetal.problems.SurvivalAnalysis.java

License:Open Source License

/**
 * Evaluates a solution.
 *
 * Reads the data file, keeps the attributes selected by the solution,
 * removes the last two attributes ('T' and 'Censor'), clusters the
 * remaining data into two clusters and compares the two groups with a
 * log-rank test (computed in Java when {@code re} is null, otherwise
 * through R). The objectives are then set from the p-value or test
 * statistic, the number of selected features and, when there are three
 * objectives, the arithmetic-harmonic cut score.
 *
 * Any exception raised while doing so terminates the JVM with exit code 1.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;

    DataSource source;

    // Double.MAX_VALUE is the initial value of all three scores.
    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    //double statScore;
    REXP x;

    variable = ((Binary) solution.getDecisionVariables()[0]);

    counterSelectedFeatures = 0;

    try {
        // read the data file 
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();
        //System.out.print("Data read successfully. ");
        //System.out.print("Number of attributes: " + data.numAttributes());
        //System.out.println(". Number of instances: " + data.numInstances());

        // save the attribute 'T' and 'Censor'
        // (the second-to-last and last attributes of the full dataset)
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on chromosome
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now filter the attribute 'T' and 'Censor'
        Remove filter = new Remove();
        // remove the two last attributes : 'T' and 'Censor'
        // (the index string uses numAttributes()-1 and numAttributes(), i.e. 1-based positions)
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        //System.out.println("After chromosome filtering no of attributes: " + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        // filtering complete

        /*
        // debug: write the filtered dataset

         ArffSaver saver = new ArffSaver();
         saver.setInstances(dataClusterer);
         saver.setFile(new File("filteered-data.arff"));
         saver.writeBatch();
        // end debug

        */

        // train hierarchical clusterer

        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType }); // link type comes from HC_LinkType
        //Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor Joining)
        //[SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]

        //clusterer.setDebug(true);
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false); // ?? Should it be changed to false? (Noman)

        clusterer.buildClusterer(dataClusterer);

        double[][] distanceMatrix = clusterer.getDistanceMatrix();
        // save the cluster assignments

        if (this.re == null) { // we are not calling R functions. Therefore parallelization possible

            // First pass: record each instance's cluster and count both groups
            // so the arrays below can be sized exactly.
            int[] clusterAssignment = new int[dataClusterer.numInstances()];
            int classOneCnt = 0;
            int classTwoCnt = 0;
            for (int i = 0; i < dataClusterer.numInstances(); ++i) {
                clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
                if (clusterAssignment[i] == 0) {
                    ++classOneCnt;
                } else if (clusterAssignment[i] == 1) {
                    ++classTwoCnt;
                }
                //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
            }

            //System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

            // create arrays with time (event occurrence time) and censor data for use with jstat LogRankTest
            double[] time1 = new double[classOneCnt];
            double[] censor1 = new double[classOneCnt];
            double[] time2 = new double[classTwoCnt];
            double[] censor2 = new double[classTwoCnt];

            //data = source.getDataSet();
            // Second pass: time/censor values are read from the unfiltered 'data'
            // at the same row index i as the clustered instance.
            for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
                //clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
                if (clusterAssignment[i] == 0) {
                    time1[cnt1] = data.get(i).value(attTime);
                    censor1[cnt1++] = data.get(i).value(attCensor);
                    //System.out.println("i: " + i + " T: " + time1[cnt1-1]);
                } else if (clusterAssignment[i] == 1) {
                    time2[cnt2] = data.get(i).value(attTime);
                    //System.out.println("i: " + i + " T: " + time2[cnt2-1]);
                    censor2[cnt2++] = data.get(i).value(attCensor);
                    ;
                }
                //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
            }

            //Instances[] classInstances = separateClassInstances(clusterAssignment, this.dataFileName,solution);
            //System.out.println("Class instances seperated");

            // calculate log rank test and p values
            // (scores[0] is used as the test statistic, scores[2] as the p-value)

            LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
            double[] scores = testclass1.logRank();
            testStatistic = scores[0];
            pValue = scores[2];

            ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
            //debug:
            //System.out.println("Calculation by myLibrary: testStatistic: " + scores[0] + " pValue: " + scores[2]);
            //end debug
            //WilcoxonTest testclass1 = new WilcoxonTest(time1, censor1, time2, censor2);
            //testStatistic = testclass1.testStatistic;
            //pValue = testclass1.pValue;
        } else { // We are calling R for Log Rank test, Parallelization not possible

            // Build three R vector assignments: time, censor and cluster group.
            // ArithmeticHarmonicCutScore is not computed in this branch and keeps
            // its initial value.
            String strT = "time <- c(";
            String strC = "censor <- c(";
            String strG = "group <- c(";

            // All but the last instance are followed by a comma ...
            for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
                strT = strT + (int) data.get(i).value(attTime) + ",";
                strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
                strC = strC + (int) data.get(i).value(attCensor) + ",";
            }

            // ... and the last one closes the vector.
            int tmpi = dataClusterer.numInstances() - 1;
            strT = strT + (int) data.get(tmpi).value(attTime) + ")";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
            strC = strC + (int) data.get(tmpi).value(attCensor) + ")";

            this.re.eval(strT);
            this.re.eval(strC);
            this.re.eval(strG);

            //debug
            //System.out.println(strT);
            //System.out.println(strC);
            //System.out.println(strG);
            //end debug

            /** If you are calling surv_test from coin library */
            /*
            re.eval("library(coin)");
            re.eval("grp <- factor (group)");
            re.eval("result <- surv_test(Surv(time,censor)~grp,distribution=\"exact\")");

            x=re.eval("statistic(result)");
            testStatistic = x.asDouble();
            //x=re.eval("pvalue(result)");
            //pValue = x.asDouble();
            //System.out.println("StatScore: " + statScore + "pValue: " + pValue);
             */

            /** If you are calling survdiff from survival library (much faster) */
            re.eval("library(survival)");
            re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
            x = re.eval("res2$chisq");
            testStatistic = x.asDouble();
            //System.out.println(x);
            x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
            //x = re.eval("1.0 - pchisq(res2$chisq, df=1)");
            pValue = x.asDouble();
            //debug:
            //System.out.println("Calculation by R: StatScore: " + testStatistic + "pValue: " + pValue);
            //end debug

        }

    } catch (Exception e) {
        // TODO Auto-generated catch block
        // NOTE(review): every exception from the try block lands here (filtering,
        // clustering, log-rank test, R calls), so the message below can be
        // misleading; the stack trace shows the real cause.
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }

    /**********
     *  Current Implementation considers two objectives
     *  1. pvalue to be minimized / statistical score to be maximized
     *  2. Number of Features to be maximized/minimized
     *  A third objective (the cut score) is set when numberOfObjectives_ is 3.
     */

    // Count the selected features, i.e. the bits that are set in the binary variable.
    for (int i = 0; i < variable.getNumberOfBits(); i++)
        if (variable.bits_.get(i))
            counterSelectedFeatures++;

    // Values that should be maximized are multiplied by -1 below.
    /*
    if (Double.isNaN(testStatistic)){
       solution.setObjective(0,Double.MAX_VALUE);
    }
    else{
       solution.setObjective(0, testStatistic);
    }
    */

    if (this.pValueFlag) {
        solution.setObjective(0, pValue); // pValue to be minimized
    } else {
        solution.setObjective(0, -1.0 * testStatistic); // statistic score to be maximized
    }
    if (this.featureMax) {
        solution.setObjective(1, -1.0 * counterSelectedFeatures); // feature maximized
    } else {
        solution.setObjective(1, counterSelectedFeatures); // feature minimized
    }
    if (this.numberOfObjectives_ == 3) {
        solution.setObjective(2, -1.0 * ArithmeticHarmonicCutScore); // cut score negated, like the other maximized objectives
    }
}

From source file:jmetal.test.survivalanalysis.GenerateSurvivalGraph.java

License:Open Source License

/** 
 * Evaluates a solution /*from  ww  w .  java 2  s .  c  o m*/
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    Binary variable;
    int counterSelectedFeatures;

    DataSource source;

    double testStatistic = Double.MAX_VALUE;
    double pValue = Double.MAX_VALUE;
    double ArithmeticHarmonicCutScore = Double.MAX_VALUE;
    //double statScore;
    REXP x;

    variable = ((Binary) solution.getDecisionVariables()[0]);

    counterSelectedFeatures = 0;

    try {
        // read the data file 
        source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();
        //System.out.print("Data read successfully. ");
        //System.out.print("Number of attributes: " + data.numAttributes());
        //System.out.println(". Number of instances: " + data.numInstances());

        // save the attribute 'T' and 'Censor'
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on chromosome
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now filter the attribute 'T' and 'Censor'
        Remove filter = new Remove();
        // remove the two last attributes : 'T' and 'Censor'
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        //System.out.println("After chromosome filtering no of attributes: " + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        // filtering complete

        // List the selected features/attributes
        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes/features: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }

        System.out.println();

        /*
        // debug: write the filtered dataset
                
         ArffSaver saver = new ArffSaver();
         saver.setInstances(dataClusterer);
         saver.setFile(new File("filteered-data.arff"));
         saver.writeBatch();
        // end debug
                
        */

        // train hierarchical clusterer

        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", this.HC_LinkType });
        //Link type (Single, Complete, Average, Mean, Centroid, Ward, Adjusted complete, Neighbor Joining)
        //[SINGLE|COMPLETE|AVERAGE|MEAN|CENTROID|WARD|ADJCOMPLETE|NEIGHBOR_JOINING]

        //clusterer.setDebug(true);
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false); // ?? Should it be changed to false? (Noman)

        clusterer.buildClusterer(dataClusterer);

        double[][] distanceMatrix = clusterer.getDistanceMatrix();

        // Cluster evaluation:
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);

        if (this.testDataFileName != null) {

            DataSource testSource = new DataSource(this.testDataFileName);

            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);
            //testSource.

            // First filter the attributes based on chromosome
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            //String[] options = new String[2];
            //options[0] = "-t";
            //options[1] = "/some/where/somefile.arff";
            //eval.
            //System.out.println(eval.evaluateClusterer(testData, options));
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evluation for this solution(" + this.testDataFileName + "): "
                    + eval.clusterResultsToString());
        }

        // First analyze using my library function

        // save the cluster assignments

        int[] clusterAssignment = new int[dataClusterer.numInstances()];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
            //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }

        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // create arrays with time (event occurrence time) and censor data for use with jstat LogRankTest
        double[] time1 = new double[classOneCnt];
        double[] censor1 = new double[classOneCnt];
        double[] time2 = new double[classTwoCnt];
        double[] censor2 = new double[classTwoCnt];

        //data = source.getDataSet();
        for (int i = 0, cnt1 = 0, cnt2 = 0; i < dataClusterer.numInstances(); ++i) {
            //clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                time1[cnt1] = data.get(i).value(attTime);
                censor1[cnt1++] = data.get(i).value(attCensor);
                //System.out.println("i: " + i + " T: " + time1[cnt1-1]);
            } else if (clusterAssignment[i] == 1) {
                time2[cnt2] = data.get(i).value(attTime);
                //System.out.println("i: " + i + " T: " + time2[cnt2-1]);
                censor2[cnt2++] = data.get(i).value(attCensor);
                ;
            }
            //System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }

        //Instances[] classInstances = separateClassInstances(clusterAssignment, this.dataFileName,solution);
        //System.out.println("Class instances seperated");

        // calculate log rank test and p values

        LogRankTest testclass1 = new LogRankTest(time1, time2, censor1, censor2);
        double[] scores = testclass1.logRank();
        testStatistic = scores[0];
        pValue = scores[2];

        ArithmeticHarmonicCutScore = this.getArithmeticHarmonicCutScore(distanceMatrix, clusterAssignment);
        //debug:
        System.out.println("Calculation by myLibrary:\n testStatistic: " + scores[0] + " pValue: " + scores[2]
                + " Arithmetic Harmonic Cut Score: " + ArithmeticHarmonicCutScore);
        //end debug
        //WilcoxonTest testclass1 = new WilcoxonTest(time1, censor1, time2, censor2);
        //testStatistic = testclass1.testStatistic;
        //pValue = testclass1.pValue;true

        // Now analyze calling R for Log Rank test, Parallelization not possible

        String strT = "time <- c(";
        String strC = "censor <- c(";
        String strG = "group <- c(";

        for (int i = 0; i < dataClusterer.numInstances() - 1; ++i) {
            strT = strT + (int) data.get(i).value(attTime) + ",";
            strG = strG + clusterer.clusterInstance(dataClusterer.get(i)) + ",";
            strC = strC + (int) data.get(i).value(attCensor) + ",";
        }

        int tmpi = dataClusterer.numInstances() - 1;
        strT = strT + (int) data.get(tmpi).value(attTime) + ")";
        strG = strG + clusterer.clusterInstance(dataClusterer.get(tmpi)) + ")";
        strC = strC + (int) data.get(tmpi).value(attCensor) + ")";

        this.re.eval(strT);
        this.re.eval(strC);
        this.re.eval(strG);

        //debug
        //System.out.println(strT);
        //System.out.println(strC);
        //System.out.println(strG);
        //end debug

        /** If you are calling surv_test from coin library */
        /*v
        re.eval("library(coin)");
        re.eval("grp <- factor (group)");
        re.eval("result <- surv_test(Surv(time,censor)~grp,distribution=\"exact\")");
                
        x=re.eval("statistic(result)");
        testStatistic = x.asDouble();
        //x=re.eval("pvalue(result)");
        //pValue = x.asDouble();
        //System.out.println("StatScore: " + statScore + "pValue: " + pValue);
         */

        /** If you are calling survdiff from survival library (much faster) */
        re.eval("library(survival)");
        re.eval("res2 <- survdiff(Surv(time,censor)~group,rho=0)");
        x = re.eval("res2$chisq");
        testStatistic = x.asDouble();
        //System.out.println(x);
        x = re.eval("pchisq(res2$chisq, df=1, lower.tail = FALSE)");
        //x = re.eval("1.0 - pchisq(res2$chisq, df=1)");
        pValue = x.asDouble();
        //debug:
        //System.out.println("Calculation by R: StatScore: " + testStatistic + "pValue: " + pValue);
        //end debug

        System.out.println("Calculation by R:");
        System.out.println("StatScore: " + testStatistic + "  pValue: " + pValue);

        re.eval("timestrata1.surv <- survfit( Surv(time, censor)~ strata(group), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time, censor)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("\nCluster Assignments:");
        for (int i = 0; i < dataClusterer.numInstances(); ++i) {
            System.out.println("Instance " + i + ": " + clusterAssignment[i]);
        }

    } catch (Exception e) {
        // TODO Auto-generated catch block
        System.err.println("Can't open the data file.");
        e.printStackTrace();
        System.exit(1);
    }

}

From source file:jmetal.test.survivalanalysis.GenerateSurvivalGraphOld.java

License:Open Source License

/**
 * Evaluates a solution - actually generates the survival graph.
 *
 * <p>The method performs the following steps:
 * <ol>
 *   <li>loads the data set named by {@code dataFileName} and remembers its last two
 *       attributes (time and censor) in {@code attTime} / {@code attCensor};</li>
 *   <li>filters the attributes through {@code filterByChromosome} and removes the
 *       last two attributes before clustering;</li>
 *   <li>builds a two-cluster, complete-linkage hierarchical clusterer and, when
 *       {@code testDataFileName} is set, prints a cluster evaluation on that file;</li>
 *   <li>sends the time, censor and group vectors to R, runs {@code survdiff} and
 *       writes the survival plot to {@code SurvivalPlot-<SolutionID>.jpg};</li>
 *   <li>prints the result of the Java {@code LogRankTest} for comparison.</li>
 * </ol>
 *
 * <p>Any exception raised along the way is reported on stderr and terminates the
 * JVM with exit status 1.
 *
 * @param solution The solution to evaluate
 */
public void evaluate(Solution solution) {
    System.out.println("\nSolution ID " + this.SolutionID);

    try {
        // Read the data file.
        DataSource source = new DataSource(this.dataFileName);
        Instances data = source.getDataSet();

        // Save the attributes 'T' and 'Censor' (the last two attributes of the raw data).
        attTime = data.attribute(data.numAttributes() - 2);
        attCensor = data.attribute(data.numAttributes() - 1);

        // First filter the attributes based on the chromosome.
        Instances tmpData = this.filterByChromosome(data, solution);

        // Now remove the two last attributes ('T' and 'Censor') so they do not take part
        // in the clustering. NOTE(review): Remove appears to take 1-based indices, hence
        // numAttributes() - 1 and numAttributes() - confirm against the Weka version in use.
        Remove filter = new Remove();
        filter.setAttributeIndices("" + (tmpData.numAttributes() - 1) + "," + tmpData.numAttributes());
        filter.setInputFormat(tmpData);
        Instances dataClusterer = Filter.useFilter(tmpData, filter);

        Enumeration<Attribute> attributeList = dataClusterer.enumerateAttributes();
        System.out.println("Selected attributes: ");
        while (attributeList.hasMoreElements()) {
            Attribute att = attributeList.nextElement();
            System.out.print(att.name() + ",");
        }
        System.out.println();
        // Filtering complete.

        // Train the hierarchical clusterer: complete linkage, two clusters, Euclidean distance.
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        clusterer.setOptions(new String[] { "-L", "COMPLETE" });
        clusterer.setNumClusters(2);
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setDistanceIsBranchLength(false);
        clusterer.buildClusterer(dataClusterer);

        // Cluster evaluation (only when a test file has been configured).
        ClusterEvaluation eval = new ClusterEvaluation();
        eval.setClusterer(clusterer);

        if (this.testDataFileName != null) {
            DataSource testSource = new DataSource(this.testDataFileName);

            Instances tmpTestData = testSource.getDataSet();
            tmpTestData.setClassIndex(tmpTestData.numAttributes() - 1);

            // First filter the attributes based on the chromosome.
            Instances testData = this.filterByChromosome(tmpTestData, solution);
            eval.evaluateClusterer(testData);
            System.out.println("\nCluster evluation for this solution: " + eval.clusterResultsToString());
        }

        // Compute every cluster assignment exactly once; all later steps reuse this array
        // instead of asking the clusterer again for the same instance.
        int numInstances = dataClusterer.numInstances();
        int[] clusterAssignment = new int[numInstances];
        int classOneCnt = 0;
        int classTwoCnt = 0;
        for (int i = 0; i < numInstances; ++i) {
            clusterAssignment[i] = clusterer.clusterInstance(dataClusterer.get(i));
            if (clusterAssignment[i] == 0) {
                ++classOneCnt;
            } else if (clusterAssignment[i] == 1) {
                ++classTwoCnt;
            }
        }

        System.out.println("Class 1 cnt: " + classOneCnt + " Class 2 cnt: " + classTwoCnt);

        // Build, in a single pass, both the R vector assignments and the per-group arrays
        // for the Java LogRankTest. Group 1 is cluster 0; every other instance goes to
        // group 2, so the second pair of arrays is sized by "everything that is not cluster 0".
        StringBuilder strT = new StringBuilder("time1 <- c(");
        StringBuilder strC = new StringBuilder("censor1 <- c(");
        StringBuilder strG = new StringBuilder("group1 <- c(");

        double[] time1 = new double[classOneCnt];
        double[] censor1 = new double[classOneCnt];
        double[] time2 = new double[numInstances - classOneCnt];
        double[] censor2 = new double[numInstances - classOneCnt];
        int i1 = 0;
        int i2 = 0;

        for (int i = 0; i < numInstances; ++i) {
            double time = data.get(i).value(attTime);
            double censor = data.get(i).value(attCensor);

            if (i > 0) {
                strT.append(',');
                strC.append(',');
                strG.append(',');
            }
            // The values sent to R are truncated to int, as in the original implementation;
            // the Java LogRankTest below receives the untruncated doubles.
            strT.append((int) time);
            strC.append((int) censor);
            strG.append(clusterAssignment[i]);

            if (clusterAssignment[i] == 0) {
                time1[i1] = time;
                censor1[i1] = censor;
                ++i1;
            } else {
                time2[i2] = time;
                censor2[i2] = censor;
                ++i2;
            }
        }
        strT.append(')');
        strC.append(')');
        strG.append(')');

        this.re.eval(strT.toString());
        this.re.eval(strC.toString());
        this.re.eval(strG.toString());

        // Log-rank test via survdiff from the R survival library (an alternative would be
        // surv_test from the coin library, which is much slower).
        re.eval("library(survival)");
        re.eval("res21 <- survdiff(Surv(time1,censor1)~group1,rho=0)");

        // NOTE(review): eval is assumed to return null when R fails to evaluate the
        // expression; fail with an explicit message instead of a bare NullPointerException.
        REXP x = re.eval("res21$chisq");
        if (x == null) {
            throw new IllegalStateException("R returned no result for res21$chisq");
        }
        double testStatistic = x.asDouble();

        x = re.eval("pchisq(res21$chisq, df=1, lower.tail = FALSE)");
        if (x == null) {
            throw new IllegalStateException("R returned no result for the pchisq p-value");
        }
        double pValue = x.asDouble();

        System.out.println("Results from R:");
        System.out.println("StatScore: " + testStatistic + "  pValue: " + pValue);

        // Plot the per-group survival curves and the overall curve into one JPEG file.
        re.eval("timestrata1.surv <- survfit( Surv(time1, censor1)~ strata(group1), conf.type=\"log-log\")");
        re.eval("timestrata1.surv1 <- survfit( Surv(time1, censor1)~ 1, conf.type=\"none\")");
        String evalStr = "jpeg('SurvivalPlot-" + this.SolutionID + ".jpg')";
        re.eval(evalStr);
        re.eval("plot(timestrata1.surv, col=c(2,3), xlab=\"Time\", ylab=\"Survival Probability\")");
        re.eval("par(new=T)");
        re.eval("plot(timestrata1.surv1,col=1)");
        re.eval("legend(0.2, c(\"Group1\",\"Group2\",\"Whole\"))");
        re.eval("dev.off()");

        System.out.println("Results from my code: ");
        LogRankTest lrt = new LogRankTest(time1, time2, censor1, censor2);
        double[] results = lrt.logRank();
        System.out.println("Statistics: " + results[0] + " variance: " + results[1] + " pValue: " + results[2]);

    } catch (Exception e) {
        // Everything above (data loading, filtering, clustering, R calls) ends up here,
        // so the message must not claim that only the data file could be at fault.
        System.err.println("Failed to evaluate solution " + this.SolutionID
                + " (data loading, clustering or R evaluation failed).");
        e.printStackTrace();
        System.exit(1);
    }

    /**********
     *  Current Implementation considers two objectives
     *  1. pvalue to be minimized / statistical score to be maximized
     *  2. Number of Features to be maximized/minimized
     *
     *  NOTE(review): this method only prints the statistics and writes the plot;
     *  it does not assign any objective value to the solution.
     */

}