Example usage for weka.core Instances toString

Introduction

On this page you can find example usages of weka.core.Instances.toString().

Prototype

@Override
public String toString() 

Document

Returns the dataset as a string in ARFF format.
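
Before the project examples below, here is a minimal sketch of the typical pattern, assuming a placeholder input path data/iris.arff: because toString() renders the header and every instance in ARFF format, writing its result to a file produces a valid .arff copy of the dataset.

import java.io.BufferedWriter;
import java.io.FileWriter;

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class InstancesToStringExample {
    public static void main(String[] args) throws Exception {
        // Load a dataset; the path is a placeholder for illustration.
        Instances data = DataSource.read("data/iris.arff");

        // toString() returns the dataset, header and instances alike,
        // as one ARFF-formatted string, so it can be written straight to disk.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter("copy.arff"))) {
            writer.write(data.toString());
        }
    }
}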

Usage

From source file:mao.datamining.DataSetPair.java

private void createSampleDataSets() {
    try {
        //reload the new data from the new arff file: Main.OrangeProcessedDSHome + "/afterRemoveRows2.arff"
        Instances newData = ConverterUtils.DataSource
                .read(Main.OrangeProcessedDSHome + "/afterRemoveRows2.arff");
        newData.setClassIndex(newData.numAttributes() - 1);
        //create none sample file
        //            Main.logging("== New Data After Doing Nothing, waiting for CostMatrix: ===\n" + newData.toSummaryString());
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(Main.OrangeProcessedDSHome + "/afterNoneSampling.arff")))) {
            writer.write(newData.toString());
        }
        //create under sample file
        //            System.out.println("Under Samplessssssssssssssssssssssssssssssssssssss");
        SpreadSubsample underSampleFilter = new weka.filters.supervised.instance.SpreadSubsample();
        underSampleFilter.setInputFormat(newData);
        String[] underOptionsClone = new String[underSampleFilterOptions.length];
        System.arraycopy(underSampleFilterOptions, 0, underOptionsClone, 0, underSampleFilterOptions.length);
        underSampleFilter.setOptions(underOptionsClone);
        Instances underNewData = Filter.useFilter(newData, underSampleFilter);

        //            Main.logging("== New Data After Under Sampling: ===\n" + underNewData.toSummaryString());
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(Main.OrangeProcessedDSHome + "/afterUnderSampling.arff")))) {
            writer.write(underNewData.toString());
        }
        //create over sample file
        //            System.out.println("Over Samplessssssssssssssssssssssssssssssssssssss");
        //weka.filters.supervised.instance.SMOTE -C 0 -K 5 -P 1000.0 -S 1 smoteOptions
        SMOTE smote = new weka.filters.supervised.instance.SMOTE();
        smote.setInputFormat(newData);
        String[] overOptionsClone = new String[overSampleSmoteOptions.length];
        System.arraycopy(overSampleSmoteOptions, 0, overOptionsClone, 0, overSampleSmoteOptions.length);
        smote.setOptions(overOptionsClone);
        Instances overNewData = Filter.useFilter(newData, smote);

        //            Main.logging("== New Data After Over Sampling: ===\n" + overNewData.toSummaryString());
        try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(Main.OrangeProcessedDSHome + "/afterOverSampling.arff")))) {
            writer.write(overNewData.toString());
        }

    } catch (Exception ex) {
        Logger.getLogger(DataSetPair.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:mao.datamining.DataSetPair.java

/**
 * Pre-Process the training data set with:
 * RemoveUselessColumnsByMissingValues filter
 * SpreadSubsample filter to shrink the majority class instances 
 * AttributeSelection filter with CfsSubsetEval and LinearForwardSelection
 */
private void processTrainRawData() {
    System.out.println("====================" + this.trainFileName + "====================");
    System.out.println("====================" + this.trainFileName + "====================");
    System.out.println("====================" + this.trainFileName + "====================");
    finalTrainAttrList.clear();
    try {
        doItOnce4All();
        String sampleFilePath = null;
        //step 2, either over sample, or under sample
        //weka.filters.supervised.instance.SpreadSubsample
        if (this.resampleMethod.equalsIgnoreCase(resampleUnder)) {
            System.out.println("Under Samplessssssssssssssssssssssssssssssssssssss");
            sampleFilePath = Main.OrangeProcessedDSHome + "/afterUnderSampling.arff";
        } else if (resampleMethod.equalsIgnoreCase(resampleOver)) {
            System.out.println("Over Samplessssssssssssssssssssssssssssssssssssss");
            sampleFilePath = Main.OrangeProcessedDSHome + "/afterOverSampling.arff";
        } else if (resampleMethod.equalsIgnoreCase(resampleNone)) {
            //do nothing,
            System.out.println("None Samplessssssssssssssssssssssssssssssssssssss");
            sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff";
        } else if (resampleMethod.equalsIgnoreCase(resampleMatrix)) {
            //do nothing
            System.out.println("Matrix Samplessssssssssssssssssssssssssssssssssssss");
            sampleFilePath = Main.OrangeProcessedDSHome + "/afterNoneSampling.arff";
        } else {
            doNotSupport();
        }
        Instances newData = ConverterUtils.DataSource.read(sampleFilePath);
        newData.setClassIndex(newData.numAttributes() - 1);
        //            Main.logging("== New Data After Resampling class instances: ===\n" + newData.toSummaryString());

        //Step 3, select features
        AttributeSelection attrSelectionFilter = new AttributeSelection();
        ASEvaluation eval = null;
        ASSearch search = null;

        //ranker
        if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionA)) {
            System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss");
            System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss");
            System.out.println("Ranker ssssssssssssssssssssssssssssssssssssss");
            eval = new weka.attributeSelection.InfoGainAttributeEval();
            //weka.attributeSelection.Ranker -T 0.02 -N -1
            search = new Ranker();
            String[] rankerOptions = { "-T", "0.01", "-N", "-1" };
            if (resampleMethod.equalsIgnoreCase(resampleOver)) {
                rankerOptions[1] = "0.1";
            }
            ((Ranker) search).setOptions(rankerOptions);
            Main.logging("== Start to Select Features with InfoGainAttributeEval and Ranker");
        }
        //weka.attributeSelection.LinearForwardSelection -D 0 -N 5 -I -K 50 -T 0
        else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionB)) {
            System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss");
            System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss");
            System.out.println("CfsSubset ssssssssssssssssssssssssssssssssssssss");
            eval = new CfsSubsetEval();
            search = new LinearForwardSelection();
            String[] linearOptions = { "-D", "0", "-N", "5", "-I", "-K", "50", "-T", "0" };
            ((LinearForwardSelection) search).setOptions(linearOptions);
            Main.logging("== Start to Select Features with CfsSubsetEval and LinearForwardSelection");
        } else if (this.featureSelectionMode.equalsIgnoreCase(featureSelectionNo)) {
            System.out.println("None Selection ssssssssssssssssssssssssssssssssssssss");
            Main.logging("No Feature Selection Method");
        } else {
            doNotSupport();
        }

        if (eval != null) {
            attrSelectionFilter.setEvaluator(eval);
            attrSelectionFilter.setSearch(search);
            attrSelectionFilter.setInputFormat(newData);
            newData = Filter.useFilter(newData, attrSelectionFilter);
        }

        Main.logging("== New Data After Selecting Features: ===\n" + newData.toSummaryString());

        //finally, write the final dataset to file system

        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(this.trainFileName)))) {
            writer.write(newData.toString());
        }

        int numAttributes = newData.numAttributes();
        for (int i = 0; i < numAttributes; i++) {
            String attrName = newData.attribute(i).name();
            finalTrainAttrList.add(attrName);
        }
        Main.logging(finalTrainAttrList.toString());
        //set the final train dataset
        finalTrainDataSet = newData;
        finalTrainDataSet.setClassIndex(finalTrainDataSet.numAttributes() - 1);

        Main.logging("train dataset class attr: " + finalTrainDataSet.classAttribute().toString());
    } catch (Exception ex) {
        Main.logging(null, ex);
    }

}

From source file:mao.datamining.DataSetPair.java

/**
 * Drops the unused columns from the test dataset, if it exists, keeping
 * only the attributes selected for the training set.
 */
private void processTestDataSet() {
    if (!new File(testSourceFileName).exists())
        return;

    try {
        Instances orangeTestDataSet = ConverterUtils.DataSource.read(testSourceFileName);
        Remove remove = new Remove();
        StringBuilder indexBuffer = new StringBuilder();
        for (String attrName : finalTrainAttrList) {
            int attrIndex = orangeTestDataSet.attribute(attrName).index();
            indexBuffer.append(attrIndex + 1).append(",");
        }
        Main.logging("Attribute Indices: \n" + indexBuffer.toString());
        remove.setAttributeIndices(indexBuffer.toString());
        remove.setInvertSelection(true);

        remove.setInputFormat(orangeTestDataSet);
        Instances testNewDataSet = Filter.useFilter(orangeTestDataSet, remove);

        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(this.testFileName)))) {
            writer.write(testNewDataSet.toString());
        }

        //set the final test dataset
        finalTestDataSet = testNewDataSet;
        finalTestDataSet.setClassIndex(finalTestDataSet.numAttributes() - 1);
        Main.logging("test dataset class attr: " + finalTestDataSet.classAttribute().toString());
    } catch (Exception e) {
        Main.logging(null, e);
    }
}

From source file:mao.datamining.DataSetPair.java

private static void generateDataSetPairs() {

    String[][] matrix = {

            { resampleNone, featureSelectionNo }, { resampleNone, featureSelectionA },
            { resampleNone, featureSelectionB },

            { resampleUnder, featureSelectionNo }, { resampleUnder, featureSelectionA },
            { resampleUnder, featureSelectionB },

            { resampleOver, featureSelectionNo }, { resampleOver, featureSelectionA },
            { resampleOver, featureSelectionB },

            { resampleMatrix, featureSelectionNo }, { resampleMatrix, featureSelectionA },
            { resampleMatrix, featureSelectionB }

    };

    String mergeFilePath = Main.OrangeProcessedDSHome + "/mergedFile.arff";
    DataSetPair ds1 = new DataSetPair();
    for (String[] row : matrix) {
        try {
            ds1.setFeatures(row[0], row[1]);
            ds1.processTrainRawData();
            ds1.processTestDataSet();

            Util.DataSetFiles dsFiles = new Util.DataSetFiles(ds1.getTrainFileName(), ds1.getTestFileName(),
                    mergeFilePath);
            //merge the 2 files
            Util.mergeTrainTestFiles(dsFiles);
            Instances mergeData = null;
            //                //numeric to nominal, to be delete ???
            //                mergeData = ConverterUtils.DataSource.read(mergeFilePath);
            //                Instances transformedDS = Util.transformNum2Nominal(mergeData, columns2Nominal);
            //                try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(mergeFilePath)))) {
            //                    writer.write(transformedDS.toString());
            //                }

            mergeData = ConverterUtils.DataSource.read(mergeFilePath);
            //normalize the 2 files together
            Instances normalizeData = Util.normalizeDS(mergeData);
            try (BufferedWriter writer = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(mergeFilePath)))) {
                writer.write(normalizeData.toString());
            }

            //split them
            Util.splitTrainTestFiles(dsFiles);
        }

        catch (Exception ex) {
            Logger.getLogger(DataSetPair.class.getName()).log(Level.SEVERE, null, ex);
        }
    }

}

From source file:meddle.TrainModelByDomainOS.java

License:Open Source License

public static Instances populateArff(Info info, Map<String, Integer> wordCount,
        ArrayList<Map<String, Integer>> trainMatrix, ArrayList<Integer> PIILabels, int numSamples, int theta) {
    //      System.out.println(info);
    // Mapping feature_name_index
    Map<String, Integer> fi = new HashMap<String, Integer>();
    int index = 0;
    // Populate Features
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    int high_freq = trainMatrix.size();

    if (high_freq - theta < 30)
        theta = 0;
    for (Map.Entry<String, Integer> entry : wordCount.entrySet()) {
        // filter low frequency word
        String currentWord = entry.getKey();
        int currentWordFreq = entry.getValue();
        if (currentWordFreq < theta) {
            if (!SharedMem.wildKeys.get("android").containsKey(currentWord)
                    && !SharedMem.wildKeys.get("ios").containsKey(currentWord)
                    && !SharedMem.wildKeys.get("windows").containsKey(currentWord))
                continue;
        }
        Attribute attribute = new Attribute(currentWord);
        attributes.add(attribute);
        fi.put(currentWord, index);
        index++;
    }

    ArrayList<String> classVals = new ArrayList<String>();
    classVals.add("" + LABEL_NEGATIVE);
    classVals.add("" + LABEL_POSITIVE);
    attributes.add(new Attribute("PIILabel", classVals));

    // Populate Data Points
    Iterator<Map<String, Integer>> all = trainMatrix.iterator();
    int count = 0;
    Instances trainingInstances = new Instances("Rel", attributes, 0);
    trainingInstances.setClassIndex(trainingInstances.numAttributes() - 1);
    while (all.hasNext()) {
        Map<String, Integer> dataMap = all.next();
        double[] instanceValue = new double[attributes.size()];
        for (int i = 0; i < attributes.size() - 1; i++) {
            instanceValue[i] = 0;
        }
        int label = PIILabels.get(count);
        instanceValue[attributes.size() - 1] = label;
        for (Map.Entry<String, Integer> entry : dataMap.entrySet()) {
            if (fi.containsKey(entry.getKey())) {
                int i = fi.get(entry.getKey());
                int val = entry.getValue();
                instanceValue[i] = val;
            }
        }
        Instance data = new SparseInstance(1.0, instanceValue);
        trainingInstances.add(data);
        count++;
    }
    // Write into .arff file for persistence
    try (BufferedWriter bw = new BufferedWriter(
            new FileWriter(RConfig.arffFolder + info.domainOS + ".arff"))) {
        bw.write(trainingInstances.toString());
    } catch (IOException e) {
        e.printStackTrace();
    }
    return trainingInstances;
}

From source file:miRdup.WekaModule.java

License:Open Source License

public static void trainModel(File arff, String keyword) {
    dec.setMaximumFractionDigits(3);
    System.out.println("\nTraining model on file " + arff);
    try {
        // load data
        DataSource source = new DataSource(arff.toString());
        Instances data = source.getDataSet();
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1);
        }

        PrintWriter pwout = new PrintWriter(new FileWriter(keyword + Main.modelExtension + "Output"));
        PrintWriter pwroc = new PrintWriter(new FileWriter(keyword + Main.modelExtension + "roc.arff"));

        //remove ID row
        Remove rm = new Remove();
        rm.setAttributeIndices("1");
        FilteredClassifier fc = new FilteredClassifier();
        fc.setFilter(rm);

        //            // train model svm
        //            weka.classifiers.functions.LibSVM model = new weka.classifiers.functions.LibSVM();
        //            model.setOptions(weka.core.Utils.splitOptions("-S 0 -K 2 -D 3 -G 0.0 -R 0.0 -N 0.5 -M 40.0 -C 1.0 -E 0.0010 -P 0.1 -B"));
        // train model MultilayerPerceptron
        //            weka.classifiers.functions.MultilayerPerceptron model = new weka.classifiers.functions.MultilayerPerceptron();
        //            model.setOptions(weka.core.Utils.splitOptions("-L 0.3 -M 0.2 -N 500 -V 0 -S 0 -E 20 -H a"));
        // train model Adaboost on RIPPER
        //            weka.classifiers.meta.AdaBoostM1 model = new weka.classifiers.meta.AdaBoostM1();
        //            model.setOptions(weka.core.Utils.splitOptions("weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.rules.JRip -- -F 10 -N 2.0 -O 5 -S 1"));
        // train model Adaboost on FURIA
        //            weka.classifiers.meta.AdaBoostM1 model = new weka.classifiers.meta.AdaBoostM1();
        //            model.setOptions(weka.core.Utils.splitOptions("weka.classifiers.meta.AdaBoostM1 -P 100 -S 1 -I 10 -W weka.classifiers.rules.FURIA -- -F 10 -N 2.0 -O 5 -S 1 -p 0 -s 0"));
        //train model Adaboot on J48 trees
        //             weka.classifiers.meta.AdaBoostM1 model = new weka.classifiers.meta.AdaBoostM1();
        //             model.setOptions(
        //                     weka.core.Utils.splitOptions(
        //                     "-P 100 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -C 0.25 -M 2"));
        //train model Adaboot on Random Forest trees
        weka.classifiers.meta.AdaBoostM1 model = new weka.classifiers.meta.AdaBoostM1();
        model.setOptions(weka.core.Utils
                .splitOptions("-P 100 -S 1 -I 10 -W weka.classifiers.trees.RandomForest -- -I 50 -K 0 -S 1"));

        if (Main.debug) {
            System.out.print("Model options: " + model.getClass().getName().trim() + " ");
        }
        System.out.print(model.getClass() + " ");
        for (String s : model.getOptions()) {
            System.out.print(s + " ");
        }

        pwout.print("Model options: " + model.getClass().getName().trim() + " ");
        for (String s : model.getOptions()) {
            pwout.print(s + " ");
        }

        //build model
        //            model.buildClassifier(data);
        fc.setClassifier(model);
        fc.buildClassifier(data);

        // cross validation 10 times on the model
        Evaluation eval = new Evaluation(data);
        //eval.crossValidateModel(model, data, 10, new Random(1));
        StringBuffer sb = new StringBuffer();
        eval.crossValidateModel(fc, data, 10, new Random(1), sb, new Range("first,last"), false);

        //System.out.println(sb);
        pwout.println(sb);
        pwout.flush();

        // output
        pwout.println("\n" + eval.toSummaryString());
        System.out.println(eval.toSummaryString());

        pwout.println(eval.toClassDetailsString());
        System.out.println(eval.toClassDetailsString());

        //calculate important values
        String[] ev = eval.toClassDetailsString().split("\n");

        String ptmp[] = ev[3].trim().split(" ");
        String ntmp[] = ev[4].trim().split(" ");
        String avgtmp[] = ev[5].trim().split(" ");

        ArrayList<String> p = new ArrayList<String>();
        ArrayList<String> n = new ArrayList<String>();
        ArrayList<String> avg = new ArrayList<String>();

        for (String s : ptmp) {
            if (!s.trim().isEmpty()) {
                p.add(s);
            }
        }
        for (String s : ntmp) {
            if (!s.trim().isEmpty()) {
                n.add(s);
            }
        }
        for (String s : avgtmp) {
            if (!s.trim().isEmpty()) {
                avg.add(s);
            }
        }

        double tp = Double.parseDouble(p.get(0));
        double fp = Double.parseDouble(p.get(1));
        double tn = Double.parseDouble(n.get(0));
        double fn = Double.parseDouble(n.get(1));
        double auc = Double.parseDouble(avg.get(7));

        pwout.println("\nTP=" + tp + "\nFP=" + fp + "\nTN=" + tn + "\nFN=" + fn);
        System.out.println("\nTP=" + tp + "\nFP=" + fp + "\nTN=" + tn + "\nFN=" + fn);

        //specificity, sensitivity, Matthews correlation coefficient, prediction accuracy
        double sp = ((tn) / (tn + fp));
        double se = ((tp) / (tp + fn));
        double acc = ((tp + tn) / (tp + tn + fp + fn));
        //MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TN+FN)(TP+FN)(TN+FP))
        double mcc = ((tp * tn) - (fp * fn)) / Math.sqrt((tp + fp) * (tn + fn) * (tp + fn) * (tn + fp));

        String output = "\nse=" + dec.format(se).replace(",", ".") + "\nsp=" + dec.format(sp).replace(",", ".")
                + "\nACC=" + dec.format(acc).replace(",", ".") + "\nMCC=" + dec.format(mcc).replace(",", ".")
                + "\nAUC=" + dec.format(auc).replace(",", ".");

        pwout.println(output);
        System.out.println(output);

        pwout.println(eval.toMatrixString());
        System.out.println(eval.toMatrixString());

        pwout.flush();
        pwout.close();

        //Saving model
        System.out.println("Model saved: " + keyword + Main.modelExtension);
        weka.core.SerializationHelper.write(keyword + Main.modelExtension, fc.getClassifier() /*model*/);

        // get curve
        ThresholdCurve tc = new ThresholdCurve();
        int classIndex = 0;
        Instances result = tc.getCurve(eval.predictions(), classIndex);
        pwroc.print(result.toString());
        pwroc.flush();
        pwroc.close();

        // draw curve
        //rocCurve(eval);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

From source file:miRdup.WekaModule.java

License:Open Source License

public static void rocCurve(Evaluation eval) {
    try {
        // generate curve
        ThresholdCurve tc = new ThresholdCurve();
        int classIndex = 0;
        Instances result = tc.getCurve(eval.predictions(), classIndex);
        result.toString(); // ARFF rendering of the curve data; the return value is unused here
        // plot curve
        ThresholdVisualizePanel vmc = new ThresholdVisualizePanel();
        vmc.setROCString("(Area under ROC = " + Utils.doubleToString(tc.getROCArea(result), 4) + ")");
        vmc.setName(result.relationName());
        PlotData2D tempd = new PlotData2D(result);
        tempd.setPlotName(result.relationName());
        tempd.addInstanceNumberAttribute();
        // specify which points are connected
        boolean[] cp = new boolean[result.numInstances()];
        for (int n = 1; n < cp.length; n++) {
            cp[n] = true;
        }
        tempd.setConnectPoints(cp);
        // add plot
        vmc.addPlot(tempd);

        // display curve
        String plotName = vmc.getName();
        final javax.swing.JFrame jf = new javax.swing.JFrame("Weka Classifier Visualize: " + plotName);
        jf.setSize(500, 400);
        jf.getContentPane().setLayout(new BorderLayout());
        jf.getContentPane().add(vmc, BorderLayout.CENTER);
        jf.addWindowListener(new java.awt.event.WindowAdapter() {
            public void windowClosing(java.awt.event.WindowEvent e) {
                jf.dispose();
            }
        });

        jf.setVisible(true);
        System.out.println("");
    } catch (Exception e) {
        e.printStackTrace();
    }

}

From source file:moa.tud.ke.patching.AdaptivePatchingAdwin.java

static void writeArff(String filename, Instances data) {
    try (BufferedWriter writer = new BufferedWriter(new FileWriter(filename))) {
        writer.write(data.toString());
    } catch (Exception e) {
        System.err.println("Error writing arff file.");
    }
}

From source file:model.clustering.Clustering.java

public String filledFile(Instances data, int numOfClusters, String remove) throws Exception {

    String mainData = data.toString();
    int index = mainData.indexOf("@data");
    String clusterData = mainData.substring(0, index + 6);

    Remove removeFilter = new Remove();
    removeFilter.setAttributeIndices(remove);

    kMeansCLusterer = new SimpleKMeans();
    kMeansCLusterer.setNumClusters(numOfClusters);

    FilteredClusterer filteredClusterer = new FilteredClusterer();
    filteredClusterer.setClusterer(kMeansCLusterer);
    filteredClusterer.setFilter(removeFilter);
    filteredClusterer.buildClusterer(data);

    Enumeration<Instance> newData = data.enumerateInstances();

    eval = new ClusterEvaluation();
    eval.setClusterer(filteredClusterer);
    eval.evaluateClusterer(data);

    while (newData.hasMoreElements()) {

        Instance i = newData.nextElement();
        int kluster = filteredClusterer.clusterInstance(i);
        String instanceString = i.toString() + "," + kluster;
        clusterData = clusterData + instanceString + "\n";

    }
    return clusterData;
}

From source file:mulan.data.ConverterCLUS.java

License:Open Source License

/**
 * Converts the original dataset to a Mulan-compatible dataset.
 *
 * @param sourceFilename the source file name
 * @param arffFilename the converted arff name
 * @param xmlFilename the xml name
 * @throws java.lang.Exception
 */
public static void convert(String sourceFilename, String arffFilename, String xmlFilename) throws Exception {
    String line;
    try (BufferedReader brInput = new BufferedReader(new FileReader(sourceFilename))) {

        String relationName = null;
        ArrayList<Attribute> attInfo = new ArrayList<Attribute>();
        Instances data = null;
        int numAttributes = 0;
        String[] labelNames = null;
        while ((line = brInput.readLine()) != null) {
            if (line.startsWith("@RELATION")) {
                relationName = line.replace("@RELATION ", "").replaceAll("'", "").trim();
                continue;
            }
            if (line.startsWith("@ATTRIBUTE ")) {
                String[] tokens = line.split("\\s+");
                Attribute att;
                if (line.startsWith("@ATTRIBUTE class")) {
                    labelNames = tokens[3].split(",");
                    for (int i = 0; i < labelNames.length; i++) {
                        ArrayList<String> labelValues = new ArrayList<String>();
                        labelValues.add("0");
                        labelValues.add("1");
                        att = new Attribute(labelNames[i], labelValues);
                        attInfo.add(att);
                    }
                } else {
                    numAttributes++;
                    if (tokens[2].equals("numeric")) {
                        att = new Attribute(tokens[1]);
                    } else {
                        ArrayList<String> nominalValues = new ArrayList<String>();
                        String[] nominalTokens = tokens[2].substring(1, tokens[2].length() - 1).split(",");
                        for (int i = 0; i < nominalTokens.length; i++) {
                            nominalValues.add(nominalTokens[i]);
                        }
                        att = new Attribute(tokens[1], nominalValues);
                    }
                    attInfo.add(att);
                }
                continue;
            }
            if (line.toLowerCase().startsWith("@data")) {
                data = new Instances(relationName, attInfo, 0);
                while ((line = brInput.readLine()) != null) {
                    // fill data
                    String[] tokens = line.split(",");
                    double[] values = new double[attInfo.size()];
                    for (int i = 0; i < numAttributes; i++) {
                        Attribute att = (Attribute) attInfo.get(i);
                        if (att.isNumeric()) {
                            values[i] = Double.parseDouble(tokens[i]);
                        } else {
                            values[i] = att.indexOfValue(tokens[i]);
                        }
                    }
                    String[] labels = tokens[numAttributes].split("@");
                    // fill class values
                    for (int j = 0; j < labels.length; j++) {
                        String[] splitedLabels = labels[j].split("/");
                        String attrName = splitedLabels[0];
                        Attribute att = data.attribute(attrName);
                        values[attInfo.indexOf(att)] = 1;
                        for (int k = 1; k < splitedLabels.length; k++) {
                            attrName = attrName + "/" + splitedLabels[k];
                            att = data.attribute(attrName);
                            values[attInfo.indexOf(att)] = 1;
                        }
                    }
                    Instance instance = new DenseInstance(1, values);
                    data.add(instance);
                }
            }
        }
        BufferedWriter writer;
        writer = new BufferedWriter(new FileWriter(arffFilename));
        writer.write(data.toString());
        writer.close();

        // write xml file
        writer = new BufferedWriter(new FileWriter(xmlFilename));
        writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n");
        writer.write("<labels xmlns=\"http://mulan.sourceforge.net/labels\">\n");
        writer.write("<label name=\"" + labelNames[0] + "\">");
        int depth = 0;
        for (int i = 1; i < labelNames.length; i++) {
            int difSlashes = countSlashes(labelNames[i]) - countSlashes(labelNames[i - 1]);
            // child
            if (difSlashes == 1) {
                depth++;
                writer.write("\n");
                for (int j = 0; j < depth; j++) {
                    writer.write("\t");
                }
                writer.write("<label name=\"" + labelNames[i] + "\">");
            }
            // sibling
            if (difSlashes == 0) {
                writer.write("</label>\n");
                for (int j = 0; j < depth; j++) {
                    writer.write("\t");
                }
                writer.write("<label name=\"" + labelNames[i] + "\">");
            }
            // ancestor
            if (difSlashes < 0) {
                writer.write("</label>\n");
                for (int j = 0; j < Math.abs(difSlashes); j++) {
                    depth--;
                    for (int k = 0; k < depth; k++) {
                        writer.write("\t");
                    }
                    writer.write("</label>\n");
                }
                for (int j = 0; j < depth; j++) {
                    writer.write("\t");
                }
                writer.write("<label name=\"" + labelNames[i] + "\">");
            }
        }
        writer.write("</label>\n");
        while (depth > 0) {
            for (int k = 0; k < depth; k++) {
                writer.write("\t");
            }
            writer.write("</label>\n");
            depth--;
        }
        writer.write("</labels>");
        writer.close();

    } catch (IOException ioEx) {
        ioEx.printStackTrace();
    }
}