List of usage examples for weka.core.Instances.numInstances()

public int numInstances()
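Before the source-file examples below, a minimal self-contained sketch of the call itself; the ARFF path is a placeholder, not taken from any of the examples:

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumInstancesDemo {
    public static void main(String[] args) throws Exception {
        // Load any ARFF file; the path below is a placeholder.
        DataSource source = new DataSource("data/weather.numeric.arff");
        Instances data = source.getDataSet();
        // numInstances() returns the number of rows (Instance objects) held by the dataset.
        System.out.println("Instances: " + data.numInstances());
    }
}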
From source file: eksploracja.Eksploracja.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws Exception {
    System.out.println("Hello world - exploration here");

    // Load the data
    String filename = "C:\\Program Files\\Weka-3-8\\data\\weather.numeric.arff";
    DataSource source = new DataSource(filename);
    Instances mojeDane = source.getDataSet();

    // Display the data
    System.out.println("Data: ");
    //System.out.println(mojeDane); // the whole dataset

    Instance wiersz0 = mojeDane.firstInstance();
    System.out.println("First " + mojeDane.firstInstance()); // first row
    System.out.println("Last " + mojeDane.lastInstance());   // last row

    System.out.println("\nNumber of instances: " + mojeDane.numInstances());
    System.out.println("\nNumber of attributes: " + mojeDane.numAttributes());

    for (int i = 0; i < mojeDane.numAttributes(); i++) {
        System.out.println(i + ". " + mojeDane.attribute(i));
        Attribute atr = mojeDane.attribute(i);
        System.out.println(i + " " + atr.name());
        if (atr.isNominal()) {
            System.out.println("Attribute type: nominal");
        } else {
            System.out.println("Attribute type: numeric");
        }
    }

    // Print the data in numeric form
    System.out.println("Data as numbers: ");
    System.out.println(Arrays.toString(wiersz0.toDoubleArray()));
}
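For reference, the stock weather.numeric.arff bundled with Weka holds 14 rows and 5 attributes, so this example prints 14 for numInstances() and 5 for numAttributes().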
From source file: elh.eus.absa.CLI.java
License: Open Source License

/**
 * Main access to the train-atc functionalities. Trains ATC using a double one-vs-all classifier
 * (E and A) for E#A aspect categories.
 *
 * @throws Exception
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)
    WekaWrapper onevsall;
    try {
        // first one-vs-all classifier: entity category (entCat)
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());

        onevsall = new WekaWrapper(entdata, true);
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes =
                onevsall.predictOneVsAll(modelsPath, paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");

        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);

        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                if (ovsaRes.get(i).get(cl) > threshold) {
                    // after the first accepted class, create a new instance per extra class
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);
                    } else {
                        // for the first accepted class, update the current instance
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        // trgt, offsetFrom, offsetTo, polarity, cat, sId
                        //Opinion op = new Opinion(instOps.get(i) + "_" + oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            }
        } // finished updating instance data

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        // second classifier: attribute category (attCat)
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }
        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);

        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        // after the first accepted class, add a new opinion per extra class
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        } else {
                            // for the first accepted class, replace the original opinion
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                }
            }
        } // finished updating instance data

        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }
    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}
From source file: elh.eus.absa.Features.java
License: Open Source License

/**
 * Fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features
 *         specified in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator
            + "train_" + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");
        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }

    // create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    System.err.println("Features: loadInstances() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        // value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        String opNormalized = corpus.getOpinionSentence(oId);

        // compute uppercase ratio before normalization (if needed)
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }

        // string normalization (emoticons, twitter grammar, ...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }

        // process the current instance with the NLP pipeline to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";

        // counter for opinion sentence token number; used for computing relative feature values
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance ("
                    + instId + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }
        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
        }

        List<String> windowWFIds = new ArrayList<String>();

        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());
            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                // if the current word form is in the ngram list, activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false);
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }
            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }
            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        // flush the ngram list and add the remaining ngrams to the feature vector
        checkNgramFeatures(ngrams, values, "wf", 1, true);

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                // lemmas
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }
                    String lemma = t.getLemma();
                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);

                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        // if the current lemma is in the ngram list, activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1);
                            }
                        }
                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        }
                    }
                }
                // pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());
                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            }

            // flush the lemma ngram list and add the remaining ngrams to the feature vector
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1);
                }
                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                }
                ngrams.removeFirst();
            }

            // flush the pos ngram list and add the remaining pos ngrams to the feature vector
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // create object for the current instance and associate it with the current train dataset
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        // add instance to train data
        rsltdata.add(inst);

        // store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstances() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
From source file: elh.eus.absa.Features.java
License: Open Source License

/**
 * Fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features
 *         specified in the parameter file.
 */
public Instances loadInstancesTAB(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator
            + "train_" + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    // create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    System.err.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesTAB() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        // value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }
        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        String[] noWindow = corpus.getOpinionSentence(oId).split("\n");
        // counter for opinion sentence token number; used for computing relative feature values
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\t");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // a blank line means a sentence end: flush the n-gram list and start over
            if (wf.equals("")) {
                // add the remaining ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true);
                // since wf is empty there is no need to check for clusters and other features
                continue;
            }
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                // if the current word form is in the ngram list, activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false);
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }
            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }
            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        // flush the ngram list and add the remaining ngrams to the feature vector
        checkNgramFeatures(ngrams, values, "", 1, true);

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                // lemmas
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }
                    // a blank line means a sentence end: flush the n-gram list and start over
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams);
                        // since t is empty there is no need to check for clusters and other features
                        continue;
                    }
                    String[] fields = t.split("\t");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));
                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);
                    // check both lemma n-grams and polarity lexicons, and add values to the vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                // pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    String[] fields = t.split("\t");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);
                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            }

            // flush the lemma ngram list: check both lemma n-grams and polarity lexicons
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);
            // flush the pos ngram list and add the remaining pos ngrams to the feature vector
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // create object for the current instance and associate it with the current train dataset
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        // add instance to train data
        rsltdata.add(inst);

        // store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesTAB() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
From source file: elh.eus.absa.Features.java
License: Open Source License

/**
 * Fills the attribute vectors for the instances existing in the given CoNLL tabulated-format corpus.
 * Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save whether the Instances object should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features
 *         specified in the parameter file.
 */
public Instances loadInstancesConll(boolean save, String prefix) {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator
            + "train_" + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();
    String nafdir = params.getProperty("kafDir");
    int trainExamplesNum = trainExamples.size();

    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }

    // create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);

    System.err.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstancesConll() - featNum: " + this.featNum
            + " - trainset attrib num -> " + rsltdata.numAttributes() + " - ");

    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        // value vector
        double[] values = new double[featNum];

        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;

        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }
        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }

        // tag the opinion sentence, or reuse the NAF file if one already exists
        String nafPath = nafdir + File.separator + trainExamples.get(oId).getsId().replace(':', '_');
        String taggedFile = "";
        try {
            if (!FileUtilsElh.checkFile(nafPath + ".kaf")) {
                nafPath = NLPpipelineWrapper.tagSentence(corpus.getOpinionSentence(oId), nafPath,
                        corpus.getLang(), params.getProperty("pos-model"),
                        params.getProperty("lemma-model"), postagger);
            } else {
                nafPath = nafPath + ".kaf";
            }
            InputStream reader = new FileInputStream(new File(nafPath));
            taggedFile = IOUtils.toString(reader);
            reader.close();
        } catch (IOException | JDOMException fe) {
            fe.printStackTrace();
        }

        String[] noWindow = taggedFile.split("\n");
        // counter for opinion sentence token number; used for computing relative feature values
        int tokNum = noWindow.length;

        List<String> window = Arrays.asList(noWindow);
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer from = start - bowWin;
            if (from < 0) {
                from = 0;
            }
            Integer to = end + bowWin;
            if (to > noWindow.length - 1) {
                to = noWindow.length - 1;
            }
            window = Arrays.asList(Arrays.copyOfRange(noWindow, from, to));
        }

        // word form ngram related features
        for (String wf : window) {
            String[] fields = wf.split("\\s");
            String wfStr = normalize(fields[0], params.getProperty("normalization", "none"));
            // a blank line means a sentence end: flush the n-gram list and start over
            if (wf.equals("")) {
                // add the remaining ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, true);
                // since wf is empty there is no need to check for clusters and other features
                continue;
            }
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                // if the current word form is in the ngram list, activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "", 1, false);
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }
            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }
            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }

        // flush the ngram list and add the remaining ngrams to the feature vector
        checkNgramFeatures(ngrams, values, "", 1, true);

        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }

            for (String t : window) {
                // lemmas
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }
                    // a blank line means a sentence end: flush the n-gram list and start over
                    if (t.equals("")) {
                        // check both lemma n-grams and polarity lexicons, and add values to the vector
                        checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, true, polNgrams);
                        // since t is empty there is no need to check for clusters and other features
                        continue;
                    }
                    String[] fields = t.split("\\s");
                    if (fields.length < 2) {
                        continue;
                    }
                    String lemma = normalize(fields[1], params.getProperty("normalization", "none"));
                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);
                    // check both lemma n-grams and polarity lexicons, and add values to the vector
                    checkNgramsAndPolarLexicons(ngrams, values, "lemma", 1, tokNum, false, polNgrams);
                }

                // pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    String[] fields = t.split("\\s");
                    if (fields.length < 3) {
                        continue;
                    }
                    String pos = fields[2];
                    posNgrams.add(pos);
                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            }

            // flush the lemma ngram list: check both lemma n-grams and polarity lexicons
            checkNgramsAndPolarLexicons(ngrams, values, "", 1, tokNum, true, polNgrams);
            // flush the pos ngram list and add the remaining pos ngrams to the feature vector
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }

        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }

        // create object for the current instance and associate it with the current train dataset
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);

        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }

        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }

        // add instance to train data
        rsltdata.add(inst);

        // store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }

    System.err.println("Features : loadInstancesConll() - training data ready, total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());

    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
From source file: elh.eus.absa.WekaWrapper.java
License: Open Source License

/**
 * Trains one vs. all models over the given training data.
 *
 * @param modelpath directory to store each model for the one vs. all method
 * @param prefix prefix the models should have (each model will have the name of its class appended)
 * @throws Exception
 */
public void trainOneVsAll(String modelpath, String prefix) throws Exception {
    Instances orig = new Instances(traindata);
    Enumeration<Object> classValues = traindata.classAttribute().enumerateValues();
    String classAtt = traindata.classAttribute().name();
    while (classValues.hasMoreElements()) {
        String v = (String) classValues.nextElement();
        System.err.println("trainer onevsall for class " + v + " classifier");
        // needed because of Weka's sparse data format problems. THIS IS TROUBLE!
        if (v.equalsIgnoreCase("dummy")) {
            continue;
        }
        // copy instances and set the same class value
        Instances ovsa = new Instances(orig);
        // create a new class attribute and declare its values
        ArrayList<String> classVal = new ArrayList<String>();
        classVal.add("dummy"); // needed because of Weka's sparse data format problems
        classVal.add(v);
        classVal.add("UNKNOWN");
        ovsa.insertAttributeAt(new Attribute(classAtt + "2", classVal), ovsa.numAttributes());
        // relabel every instance that does not have the current class value as "UNKNOWN"
        for (int i = 0; i < ovsa.numInstances(); i++) {
            Instance inst = ovsa.instance(i);
            String instClass = inst.stringValue(ovsa.attribute(classAtt).index());
            if (instClass.equalsIgnoreCase(v)) {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), v);
            } else {
                inst.setValue(ovsa.attribute(classAtt + "2").index(), "UNKNOWN");
            }
        }
        // delete the old class attribute and set the new one
        ovsa.setClassIndex(ovsa.attribute(classAtt + "2").index());
        ovsa.deleteAttributeAt(ovsa.attribute(classAtt).index());
        ovsa.renameAttribute(ovsa.attribute(classAtt + "2").index(), classAtt);
        ovsa.setClassIndex(ovsa.attribute(classAtt).index());

        // build the classifier, cross-validate and store the model
        setTraindata(ovsa);
        saveModel(modelpath + File.separator + prefix + "_" + v + ".model");
        setTestdata(ovsa);
        testModel(modelpath + File.separator + prefix + "_" + v + ".model");
        System.err.println("trained onevsall " + v + " classifier");
    }
    setTraindata(orig);
}
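A condensed sketch of the relabelling pattern above: copy the training set, append a binary class attribute, and let numInstances() drive the per-instance relabelling loop. The names traindata, myClass, binaryClass and posValue are illustrative placeholders, not taken from the source above.

// Assumed: traindata is an Instances object with a nominal attribute "myClass".
String posValue = "positive"; // illustrative class value to isolate

Instances ovsa = new Instances(traindata); // work on a copy
ArrayList<String> classVal = new ArrayList<String>();
classVal.add("dummy"); // placeholder first value: Weka's sparse format treats value index 0 as the default
classVal.add(posValue);
classVal.add("UNKNOWN");
ovsa.insertAttributeAt(new Attribute("binaryClass", classVal), ovsa.numAttributes());

int oldClassIdx = ovsa.attribute("myClass").index();
int newClassIdx = ovsa.attribute("binaryClass").index();
for (int i = 0; i < ovsa.numInstances(); i++) {
    Instance inst = ovsa.instance(i);
    // keep the positive label, collapse every other label to UNKNOWN
    String label = inst.stringValue(oldClassIdx);
    inst.setValue(newClassIdx, label.equals(posValue) ? posValue : "UNKNOWN");
}
ovsa.setClassIndex(newClassIdx);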
From source file: entities.ArffFile.java

/**
 * Runs the microaggregation filter for a given list of parameters.
 * All of these parameters are user input.
 * @param df distance function, either Euclidean or Manhattan, as specified in the input
 * @param numCluster
 * @param seed
 * @param maxIterations
 * @param replaceMissingValues
 * @param preserveInstancesOrder
 * @param attributes list of the attributes to generalize via clustering
 */
public void microAgregacion(DistanceFunction df, int numCluster, int seed, int maxIterations,
        boolean replaceMissingValues, boolean preserveInstancesOrder, List<Integer> attributes)
        throws Exception {
    SimpleKMeans kMeans = new SimpleKMeans();
    Instances uniqueAttributes = new Instances(instancesFilter);
    List<String> names = new ArrayList<>();
    int i = 0;
    for (Integer attribute : attributes) {
        String name = instancesFilter.attribute(attribute).name();
        if (instancesFilter.attribute(attribute).isDate() || instancesFilter.attribute(attribute).isString())
            throw new Exception("Cannot cluster attributes of type DATE or STRING");
        names.add(name);
    }
    // keep only the attributes selected for clustering
    while (uniqueAttributes.numAttributes() != attributes.size()) {
        if (!names.contains(uniqueAttributes.attribute(i).name()))
            uniqueAttributes.deleteAttributeAt(i);
        else
            i++;
    }
    try {
        kMeans.setNumClusters(numCluster);
        kMeans.setMaxIterations(maxIterations);
        kMeans.setSeed(seed);
        kMeans.setDisplayStdDevs(false);
        kMeans.setDistanceFunction(df);
        kMeans.setDontReplaceMissingValues(replaceMissingValues);
        kMeans.setPreserveInstancesOrder(preserveInstancesOrder);
        kMeans.buildClusterer(uniqueAttributes);
        // replace each value with the centroid of the cluster the instance falls in
        for (int j = 0; j < uniqueAttributes.numInstances(); j++) {
            int cluster = kMeans.clusterInstance(uniqueAttributes.instance(j));
            for (int k = 0; k < uniqueAttributes.numAttributes(); k++) {
                if (uniqueAttributes.attribute(k).isNumeric())
                    uniqueAttributes.instance(j).setValue(k,
                            Double.parseDouble(kMeans.getClusterCentroids().instance(cluster).toString(k)));
                else
                    uniqueAttributes.instance(j).setValue(k,
                            kMeans.getClusterCentroids().instance(cluster).toString(k));
            }
        }
        replaceValues(uniqueAttributes, attributes);
    } catch (Exception ex) {
        Logger.getLogger(ArffFile.class.getName()).log(Level.SEVERE, null, ex);
    }
    //saveToFile("4");
}
From source file: entity.DifficultyResamplingManager.java
License: Open Source License

/**
 * Returns the maximum dimensions of a subdataset for a given positive rate (total, p, n).
 * @param originalDataset
 * @param positiveExamplePercentProportion
 * @return
 */
public SubdatasetDimensions calculateSubdatasetDimensionsForProportion(Instances originalDataset,
        BigDecimal positiveExamplePercentProportion) {
    // size of subdataset, initialized to the original size
    int total = originalDataset.numInstances();
    // number of positive instances
    int p = 0;
    // number of negative instances
    int n = 0;
    // current positive rate (PR)
    int pp = 0;

    // count positives
    for (int i = 0; i < total; i++) {
        if (originalDataset.instance(i).stringValue(originalDataset.classIndex())
                .equals(Settings.buggyLabel)) {
            p++;
        }
    }
    n = total - p;

    // find the actual PR
    pp = calculatePositivePercentCeil(p + n, p);
    if (verbose)
        System.out.println("[DifficultyResamplingManager, calculateSubdatasetDimensionsForProportion] "
                + "current: p=" + p + " n=" + n + " pp = " + pp);

    // if the current PR equals the desired one, return the current dimensions
    if (pp == positiveExamplePercentProportion.intValue())
        return new SubdatasetDimensions(p, n);

    // if the current PR is greater than the desired one,
    // decrement p until the ceiling of the current PR is no longer greater
    if (pp > positiveExamplePercentProportion.intValue()) {
        while (pp > positiveExamplePercentProportion.intValue()) {
            p--;
            pp = calculatePositivePercentCeil(p + n, p);
            if (verbose)
                System.out.println("[DifficultyResamplingManager, "
                        + "calculateSubdatasetDimensionsForProportion] p=" + p + " n=" + n + " pp = " + pp);
        }
        // go back if the previous PR was nearer to the desired one than the current PR
        if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + 1 + n, p + 1), pp,
                positiveExamplePercentProportion.intValue())) {
            p++;
            pp = calculatePositivePercentCeil(p + n, p);
        }
    }

    // if the current PR is less than the desired one,
    // decrement n until the ceiling of the current PR is no longer less
    if (pp < positiveExamplePercentProportion.intValue()) {
        while (pp < positiveExamplePercentProportion.intValue()) {
            n--;
            pp = calculatePositivePercentCeil(p + n, p);
            if (verbose)
                System.out.println("[DifficultyResamplingManager, "
                        + "calculateSubdatasetDimensionsForProportion] p=" + p + " n=" + n + " pp = " + pp);
        }
        // go back if the previous PR was nearer to the desired one than the current PR
        if (isPPPNearerThanPPToDesiredPercent(calculatePositivePercentCeil(p + n + 1, p), pp,
                positiveExamplePercentProportion.intValue())) {
            n++;
            pp = calculatePositivePercentCeil(p + n, p);
        }
    }

    if (verbose)
        System.out.println("[DifficultyResamplingManager, calculateSubdatasetDimensionsForProportion] "
                + "final p=" + p + " n=" + n + " pp = " + pp);
    return new SubdatasetDimensions(p, n);
}
From source file: entity.DifficultyResamplingManager.java
License: Open Source License

/**
 * Called by generateResampledSubdataset.
 *
 * @param originalDataset
 * @param subdatasetDimensions
 * @return
 */
private Instances generateResampledSubdataset(Instances originalDataset,
        SubdatasetDimensions subdatasetDimensions) {
    // create an empty dataset with the same header
    Instances resampledSubdataset = new Instances(originalDataset);
    resampledSubdataset.delete();

    // randomize dataset instance order
    originalDataset.randomize(RandomizationManager.randomGenerator);

    // number of positives to insert
    int positivesToInsert = subdatasetDimensions.getP();
    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] positivesToInsert = "
                + positivesToInsert);
    // number of negatives to insert
    int negativesToInsert = subdatasetDimensions.getN();

    // iterate over the original dataset instances
    for (int i = 0; i < originalDataset.numInstances(); i++) {
        // if the instance is positive and more are needed in the new dataset, insert it
        if ((positivesToInsert > 0) && (originalDataset.instance(i)
                .stringValue(originalDataset.classIndex()).equals(Settings.buggyLabel))) {
            resampledSubdataset.add(originalDataset.instance(i));
            positivesToInsert--;
        }
        // if the instance is negative and more are needed in the new dataset, insert it
        if ((negativesToInsert > 0) && (originalDataset.instance(i)
                .stringValue(originalDataset.classIndex()).equals(Settings.nonbuggyLabel))) {
            resampledSubdataset.add(originalDataset.instance(i));
            negativesToInsert--;
        }
    }

    if (verbose)
        System.out.println("[DifficultyResamplingManager, generateResampledSubdataset] resampling finished: "
                + this.printDatasetInfo(resampledSubdataset));
    return resampledSubdataset;
}
From source file: entity.DifficultyResamplingManager.java
License: Open Source License

/**
 * Prints the number of positive and negative instances and their respective percentages.
 * @param dataset
 * @return
 */
public String printDatasetInfo(Instances dataset) {
    int positives = 0;
    int negatives = 0;
    for (int i = 0; i < dataset.numInstances(); i++) {
        if (dataset.instance(i).stringValue(dataset.classIndex()).equals(Settings.buggyLabel)) {
            positives++;
        }
        if (dataset.instance(i).stringValue(dataset.classIndex()).equals(Settings.nonbuggyLabel)) {
            negatives++;
        }
    }
    double percent = ((double) positives / (double) dataset.numInstances()) * 100;
    return "total instances: " + dataset.numInstances() + ", p+n=" + (positives + negatives)
            + ", p: " + positives + ", n: " + negatives + ", %p : " + percent;
}