mulan.data.ConverterLibSVM.java Source code

Java tutorial

Introduction

Here is the source code for mulan.data.ConverterLibSVM.java

Source

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */

/*
 *    ConverterLibSVM.java
 *    Copyright (C) 2009-2010 Aristotle University of Thessaloniki, Thessaloniki, Greece
 */
package mulan.data;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.StringTokenizer;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;
import weka.core.Utils;

/**
 * Converts LibSVM multi-label data sets to the Mulan compatible format,
 * i.e. one ARFF file holding the data and one XML file describing the labels.
 *
 * @author Grigorios Tsoumakas
 * @version $Revision: 0.01 $
 */
public class ConverterLibSVM {

    /**
     * Converts a multi-label dataset from LibSVM format to the format
     * that is compatible with Mulan. It constructs one ARFF and one XML file.
     * <p>
     * The source file is read twice: the first pass determines the number of
     * labels, attributes and instances, the second pass writes the ARFF data.
     * Tokens of a line are separated by single spaces; a token without a
     * colon is a comma separated list of (0-based) label indices, a token of
     * the form <code>index:value</code> is a (1-based) attribute. Blank lines
     * are ignored. I/O errors are reported on the standard error stream.
     *
     * @param path the directory that contains the source file and will contain
     * the target files; it is concatenated directly with the file names, so
     * it has to end with a path separator (or be empty)
     * @param sourceFilename the name of the source file
     * @param targetFilestem the filestem for the target files (.arff and .xml)
     * @param relationName the relation name of the arff file that will be
     * constructed
     */
    public static void convertFromLibSVM(String path, String sourceFilename, String targetFilestem,
            String relationName) {
        BufferedReader aReader = null;
        BufferedWriter aWriter = null;

        int numLabels = 0;
        int numAttributes = 0;
        int numInstances = 0;
        double meanParsedAttributes = 0;

        String line;
        try {
            // First pass: calculate number of labels, attributes and instances
            aReader = new BufferedReader(new FileReader(path + sourceFilename));

            while ((line = aReader.readLine()) != null) {
                if (line.trim().length() == 0) {
                    // a blank line (e.g. a trailing newline) is not an instance
                    continue;
                }
                numInstances++;

                StringTokenizer strTok = new StringTokenizer(line, " ");
                while (strTok.hasMoreTokens()) {
                    String token = strTok.nextToken();

                    if (token.indexOf(":") == -1) {
                        // parse label info
                        StringTokenizer labelTok = new StringTokenizer(token, ",");
                        while (labelTok.hasMoreTokens()) {
                            int intLabel = Integer.parseInt(labelTok.nextToken());
                            if (intLabel > numLabels) {
                                numLabels = intLabel;
                            }
                        }
                    } else {
                        // parse attribute info
                        meanParsedAttributes++;
                        StringTokenizer attrTok = new StringTokenizer(token, ":");
                        int intAttrIndex = Integer.parseInt(attrTok.nextToken());
                        if (intAttrIndex > numAttributes) {
                            numAttributes = intAttrIndex;
                        }
                    }
                }
            }

            // Release the first-pass reader before the file is opened again;
            // otherwise its handle would leak once aReader is reassigned.
            aReader.close();
            aReader = null;

            // label indices start at 0, so the count is the largest index + 1
            numLabels++;

            System.out.println("Number of attributes: " + numAttributes);
            System.out.println("Number of instances: " + numInstances);
            System.out.println("Number of classes: " + numLabels);

            System.out.println("Constructing XML file... ");
            String labelsFilePath = path + targetFilestem + ".xml";
            try {
                createLabelsMetadataFile(labelsFilePath, numLabels);
                System.out.println("Done!");
            } catch (LabelsBuilderException e) {
                // do not leave a partially written labels file behind
                File labelsFile = new File(labelsFilePath);
                if (labelsFile.exists()) {
                    labelsFile.delete();
                }
                System.out.println("Construction of labels XML failed!");
            }

            // The data is considered sparse when, on average, a line specifies
            // fewer attributes than the dataset has.
            meanParsedAttributes /= numInstances;
            boolean sparse = meanParsedAttributes < numAttributes;
            if (sparse) {
                System.out.println("Dataset is sparse.");
            }

            // Define Instances class to hold data
            ArrayList<Attribute> attInfo = new ArrayList<Attribute>(numAttributes + numLabels);
            for (int i = 0; i < numAttributes; i++) {
                attInfo.add(new Attribute("Att" + (i + 1)));
            }
            ArrayList<String> classValues = new ArrayList<String>(2);
            classValues.add("0");
            classValues.add("1");
            for (int i = 0; i < numLabels; i++) {
                attInfo.add(new Attribute("Label" + (i + 1), classValues));
            }

            // Second pass: re-read file and convert into multi-label arff
            aWriter = new BufferedWriter(new FileWriter(path + targetFilestem + ".arff"));
            Instances data = new Instances(relationName, attInfo, 0);
            aWriter.write(data.toString());

            aReader = new BufferedReader(new FileReader(path + sourceFilename));

            while ((line = aReader.readLine()) != null) {
                if (line.trim().length() == 0) {
                    // keep in sync with the first pass
                    continue;
                }

                // a freshly allocated array is already filled with zeros
                double[] attValues = new double[numAttributes + numLabels];

                Instance tempInstance = new DenseInstance(1, attValues);
                tempInstance.setDataset(data);

                StringTokenizer strTok = new StringTokenizer(line, " ");
                while (strTok.hasMoreTokens()) {
                    String token = strTok.nextToken();

                    if (token.indexOf(":") == -1) {
                        // parse label info; labels are stored after the attributes
                        StringTokenizer labelTok = new StringTokenizer(token, ",");
                        while (labelTok.hasMoreTokens()) {
                            int intLabel = Integer.parseInt(labelTok.nextToken());
                            tempInstance.setValue(numAttributes + intLabel, 1);
                        }
                    } else {
                        // parse attribute info; attribute indices start at 1
                        StringTokenizer attrTok = new StringTokenizer(token, ":");
                        String strAttrIndex = attrTok.nextToken();
                        String strAttrValue = attrTok.nextToken();
                        tempInstance.setValue(Integer.parseInt(strAttrIndex) - 1, Double.parseDouble(strAttrValue));
                    }
                }

                if (sparse) {
                    SparseInstance tempSparseInstance = new SparseInstance(tempInstance);
                    aWriter.write(tempSparseInstance.toString() + "\n");
                } else {
                    aWriter.write(tempInstance.toString() + "\n");
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close each resource independently, so that a failure to close the
            // reader cannot prevent the writer from being flushed and closed.
            closeResource(aReader);
            closeResource(aWriter);
        }
    }

    /**
     * Closes the given resource, reporting (but not propagating) any I/O error.
     *
     * @param resource the resource to close, may be <code>null</code>
     */
    private static void closeResource(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ex) {
            ex.printStackTrace();
        }
    }

    /**
     * Creates the labels XML file: a flat list of root label nodes named
     * <code>Label1</code> ... <code>Label&lt;numLabels&gt;</code>.
     *
     * @param filePath the path of the XML file to create
     * @param numLabels the number of labels to describe
     * @throws LabelsBuilderException if the labels cannot be written
     */
    private static void createLabelsMetadataFile(String filePath, int numLabels) throws LabelsBuilderException {
        LabelsMetaDataImpl meta = new LabelsMetaDataImpl();
        for (int label = 0; label < numLabels; label++) {
            meta.addRootNode(new LabelNodeImpl("Label" + (label + 1)));
        }
        LabelsBuilder.dumpLabels(meta, filePath);
    }

    /**
     * Command line interface for the converter. Recognized options are
     * <code>-path</code>, <code>-source</code>, <code>-target</code> and
     * <code>-name</code> (the relation name, "LibSVM" when not given).
     *
     * @param args command line arguments
     */
    public static void main(String[] args) {
        String relationName = "LibSVM";
        try {
            String path = Utils.getOption("path", args);
            String source = Utils.getOption("source", args);
            String target = Utils.getOption("target", args);
            // getOption yields an empty string for a missing option; only a
            // supplied name may replace the default relation name
            String name = Utils.getOption("name", args);
            if (name.length() != 0) {
                relationName = name;
            }
            ConverterLibSVM.convertFromLibSVM(path, source, target, relationName);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}