List of usage examples for weka.core.Instances.setClassIndex
public void setClassIndex(int classIndex)
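Before the project-specific examples, here is a minimal standalone sketch of the call itself. The file name iris.arff is an assumption; any ARFF file whose last attribute is the class would do.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class SetClassIndexExample {
    public static void main(String[] args) throws Exception {
        // load a dataset (hypothetical file; assumed to be an ARFF whose last attribute is the class)
        Instances data = DataSource.read("iris.arff");

        // many Weka learners require the class attribute to be declared explicitly
        if (data.classIndex() < 0) {
            data.setClassIndex(data.numAttributes() - 1); // use the last attribute as the class
        }
        System.out.println("Class attribute: " + data.classAttribute().name());
    }
}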
From source file:gr.auth.ee.lcs.utilities.InstancesUtility.java
License:Open Source License
public static Instances[] partitionInstances(final AbstractLearningClassifierSystem lcs, final Instances trainSet)
        throws Exception {
    // Open .arff
    final Instances set = trainSet;
    if (set.classIndex() < 0) {
        set.setClassIndex(set.numAttributes() - 1);
    }
    //set.randomize(new Random());

    int numberOfLabels = (int) SettingsLoader.getNumericSetting("numberOfLabels", 1);

    // the partitions vector holds the indices
    String stringsArray[] = new String[trainSet.numInstances()];
    int indicesArray[] = new int[trainSet.numInstances()];

    // convert each instance's labelset into a string and store it in the stringsArray array
    for (int i = 0; i < set.numInstances(); i++) {
        stringsArray[i] = "";
        indicesArray[i] = i;
        for (int j = set.numAttributes() - numberOfLabels; j < set.numAttributes(); j++) {
            stringsArray[i] += (int) set.instance(i).value(j);
        }
    }

    // contains the indicesVector(s)
    Vector<Vector> mothershipVector = new Vector<Vector>();

    String baseString = "";
    for (int i = 0; i < set.numInstances(); i++) {
        baseString = stringsArray[i];
        if (baseString.equals(""))
            continue;
        Vector<Integer> indicesVector = new Vector<Integer>();
        for (int j = 0; j < set.numInstances(); j++) {
            if (baseString.equals(stringsArray[j])) {
                stringsArray[j] = "";
                indicesVector.add(j);
            }
        }
        mothershipVector.add(indicesVector);
    }

    Instances[] partitions = new Instances[mothershipVector.size()];
    for (int i = 0; i < mothershipVector.size(); i++) {
        partitions[i] = new Instances(set, mothershipVector.elementAt(i).size());
        for (int j = 0; j < mothershipVector.elementAt(i).size(); j++) {
            Instance instanceToAdd = set.instance((Integer) mothershipVector.elementAt(i).elementAt(j));
            partitions[i].add(instanceToAdd);
        }
    }
    /*
     * up to here, the partitions array has been formed. it contains the split dataset by label combinations.
     * it holds both the attributes and the labels, but for clustering the input should only be the attributes,
     * so we need to delete the labels. this is taken care of by initializePopulation()
     */
    return partitions;
}
From source file:gr.demokritos.iit.cpgislanddetection.analysis.VectorSequenceDetector.java
License:Apache License
public VectorSequenceDetector(List<BaseSequence> sequences, List<String> labels)
        throws FileNotFoundException, IOException, Exception {
    // for every sequence:
    //   get its vector from the analyzer,
    //   attach the label,
    //   and update the classifier

    // load data
    ArffLoader loader = new ArffLoader();
    loader.setFile(new File("/Desktop/filesForWeka/2o_peirama/dataForWeka.arff"));
    Instances structure = loader.getStructure();

    // setting class attribute
    structure.setClassIndex(structure.numAttributes() - 1);

    // train NaiveBayes incrementally
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(structure);
    Instance current;
    while ((current = loader.getNextInstance(structure)) != null)
        nb.updateClassifier(current);
}
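The constructor above only trains the model incrementally; here is a hedged sketch of how the trained classifier might then be applied with the same class-index setup. The class name, the labelInstances helper, and its arguments are illustrative, not part of the original source.

import java.io.File;
import weka.classifiers.bayes.NaiveBayesUpdateable;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ArffLoader;

public class LabelWithTrainedModel {
    // Sketch only: nb is assumed to be a NaiveBayesUpdateable trained as in the constructor above.
    static void labelInstances(NaiveBayesUpdateable nb, File arff) throws Exception {
        ArffLoader loader = new ArffLoader();
        loader.setFile(arff);
        Instances structure = loader.getStructure();
        structure.setClassIndex(structure.numAttributes() - 1); // must match the training setup

        Instance current;
        while ((current = loader.getNextInstance(structure)) != null) {
            double label = nb.classifyInstance(current); // index into the class attribute's values
            System.out.println(structure.classAttribute().value((int) label));
        }
    }
}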
From source file:gr.demokritos.iit.cpgislanddetection.CpGIslandDetection.java
License:Apache License
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws IOException, ParseException, Exception {
    // String sFileNameArgs = args[0];
    // String[] fileNames = null;
    // Read file
    //IGenomicSequenceFileReader reader = new SequenceListFileReader();
    // String seq = "GCTCTTGACTTTCAGACTTCCTGAAAACAACGTTCTGGTAAGGACAAGGGTT";
    //
    // CpGIslandIdentification iClass = new CpGIslandIdentification();
    // boolean b = iClass.identify(seq);
    // System.out.println("This sequence is a CpG island: " + b);
    // SequenceListFileReader s = new SequenceListFileReader();
    // ArrayList<BaseSequence> alRes = new ArrayList<>();
    //
    // alRes = s.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    // for (int i = 0; i < alRes.size(); i++)
    //     System.out.println("alRes = " + i + alRes.get(i));
    // VectorAnalyzer vA = new VectorAnalyzer();
    // List<Vector<Integer>> listVector = new ArrayList<>();
    // Vector<Vector<Integer>> list =
    // listVector = vA.analyze(alRes);
    // for (int i = 0; i < listVector.size(); i++)
    //     System.out.println(i + " " + listVector.get(i));
    //IGenomicSequenceFileReader reader = new FASTAFileReader();

    // If no input file has been given
    /*
    if (args.length == 0) { // Use default
        fileNames[0] = "C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt";
        fileNames[1] = "C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt";
        fileNames[2] = "C:\\Users\\Xenia\\Desktop\\files\\newsamples.txt";
    } else { // else use the provided one
        fileNames = sFileNameArgs.split(";");
    }
    */

    //-----------------VECTOR ANALYSIS STARTS HERE--------------------------------------
    // read sequences from txt files
    SequenceListFileReader reader = new SequenceListFileReader();
    ArrayList<BaseSequence> lSeqs1 = new ArrayList<>();
    ArrayList<BaseSequence> lSeqs2 = new ArrayList<>();
    lSeqs1 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\posSamples.txt");
    lSeqs2 = reader.getSequencesFromFile("C:\\Users\\Xenia\\Desktop\\files\\negSamples.txt");

    // create vectors for every sequence
    List<Vector<Integer>> listVectorForPositiveSamples = new ArrayList<>();
    List<Vector<Integer>> listVectorForNegativeSamples = new ArrayList<>();
    VectorAnalyzer v = new VectorAnalyzer();
    listVectorForPositiveSamples = v.analyze(lSeqs1);
    listVectorForNegativeSamples = v.analyze(lSeqs2);

    // create ARFF files for positive and negative samples
    FileCreatorARFF fc = new FileCreatorARFF();
    Instances positiveInstances = fc.createARFF(listVectorForPositiveSamples, "yes");
    Instances negativeInstances = fc.createARFF(listVectorForNegativeSamples, "no");
    //System.out.println(positiveInstances);

    // build and train classifier
    // setting class attribute
    positiveInstances.setClassIndex(positiveInstances.numAttributes() - 1);
    negativeInstances.setClassIndex(negativeInstances.numAttributes() - 1);

    // train NaiveBayes
    NaiveBayesUpdateable nb = new NaiveBayesUpdateable();
    nb.buildClassifier(positiveInstances);
    nb.buildClassifier(negativeInstances);
    Instance current;
    for (int i = 0; i < positiveInstances.numInstances(); i++) {
        current = positiveInstances.instance(i);
        nb.updateClassifier(current);
    }

    // Test the model
    Evaluation eTest = new Evaluation(positiveInstances);
    Instances isTestingSet = fc.createARFF(listVectorForNegativeSamples, "?");
    isTestingSet.setClassIndex(isTestingSet.numAttributes() - 1);
    eTest.evaluateModel(nb, isTestingSet);
    //------------------VECTOR ANALYSIS ENDS HERE---------------------------------------

    //----------------------------HMM CLASSIFIER STARTS HERE----------------------------------
    // Init classifier
    /*
    ISequenceClassifier<List<ObservationDiscrete<HmmSequence.Packet>>> classifier = new HmmClassifier();
    */
    // WARNING: Remember to change when you have normal data!!!
    // Obfuscation in negative training file?
    // final boolean bObfuscateNeg = true;
    // FASTAObfuscatorReader r = new FASTAObfuscatorReader();

    // for each file do the same work: train
    // for (int i = 0; i < 3; i++) {
    // Read the sequences
    // If obfuscation is on and we are dealing with the negative training file
    /*
    if ((i == 2) && (bObfuscateNeg)) {
        //FASTAObfuscatorReader r = new FASTAObfuscatorReader();
        lSeqs = r.getSequencesFromFile(fileNames[i]);
        fileNames[1] = "Not" + fileNames[1]; // Update to indicate different class
    } else {
        // else read normally
        lSeqs = reader.getSequencesFromFile(fileNames[i]);
    }
    System.out.println("lSeqs size=" + lSeqs.size());
    */

    // Create HMM sequences
    /*
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);

    // Train classifier with the observations
    classifier.train(lHmmSeqs, new File(fileNames[i]).getName());
    }

    // Classify the test file
    // First: Read the sequences
    lSeqs = r.getSequencesFromFile(fileNames[2]);
    //System.out.println("file name= " + fileNames[2]);

    // Then: Create HMM sequences
    ISequenceAnalyst<List<ObservationDiscrete<HmmSequence.Packet>>> analyst = new HmmAnalyzer();
    List<List<ObservationDiscrete<HmmSequence.Packet>>> lHmmSeqs = analyst.analyze(lSeqs);
    */
    //-------------------------------HMM CLASSIFIER ENDS HERE-----------------------------------------

    /*
    //----------------------------HMM EVALUATION STARTS-----------------------------------------------
    //System.out.println("size of lHmmSeqs=" + lHmmSeqs.size());
    String str = null;
    String[] savedResults = new String[lHmmSeqs.size()];

    // create a 2x2 array to store successes and failures for each class
    int[][] matrix = new int[2][2];
    int successForCpG = 0, failForCpG = 0, successForNotCpG = 0, failForNotCpG = 0;

    // Init identifier
    // CpGIslandIdentification identifier = new CpGIslandIdentification();
    CpGIslandIdentification identifier = new CpGIslandIdentificationByList("CpG_hg18.fa");

    for (int i = 0; i < lHmmSeqs.size(); i++) {
        // DEBUG
        System.err.print(".");
        if (i % 10 == 0)
            System.err.println();
        ////////
        str = classifier.classify(lHmmSeqs.get(i));
        // System.out.println("i=" + i);
        System.out.println("Determined class:" + str);
        // savedResults[i] = str;
        // call the function that checks whether the sequence satisfies the CpG criteria
        if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[0])) {
            // success for CpG class
            successForCpG++;
            System.out.println("successForCpG" + successForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) && str.equals(fileNames[1])) {
            // fail for CpG class
            failForCpG++;
            System.out.println("failForCpG" + failForCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[1])) {
            //System.out.println(i);
            // success for Not CpG class
            successForNotCpG++;
            System.out.println("successForNotCpG" + successForNotCpG);
        } else if (identifier.identify(lSeqs.get(i).getSymbolSequence()) == false && str.equals(fileNames[0])) {
            // fail for Not CpG class
            failForNotCpG++;
            System.out.println("failForNotCpG" + failForNotCpG);
        }
    }

    // Evaluation: calculation of classification rate and accuracy
    double totalAccuracy = (successForNotCpG + successForCpG)
            / (successForCpG + failForCpG + failForNotCpG + successForNotCpG);

    // misclassification rate for CpG class
    double rate1 = (failForCpG + successForCpG) != 0 ? failForCpG / (failForCpG + successForCpG) : 0.0;
    // misclassification rate for Not CpG class
    double rate2 = (failForNotCpG + successForNotCpG) != 0 ? failForNotCpG / (failForNotCpG + successForNotCpG) : 0.0;
    System.out.println(totalAccuracy + " " + rate1 + " " + rate2);

    NGramGraphClassifier nGramGraphClassifier = new NGramGraphClassifier();
    List<List<DocumentNGramGraph>> representation;
    NGramGraphAnalyzer myAnalyst = new NGramGraphAnalyzer();
    representation = myAnalyst.analyze(lSeqs);
    for (int i = 0; i < representation.size(); i++)
        nGramGraphClassifier.classify(representation.get(i));
    */
}
From source file:gr.demokritos.iit.cpgislanddetection.io.FileCreatorARFF.java
public Instances createARFF(List<Vector<Integer>> listVector, String nameClass) throws ParseException {
    // Declare four numeric attributes
    Attribute Attribute1 = new Attribute("adenine");
    Attribute Attribute2 = new Attribute("thymine");
    Attribute Attribute3 = new Attribute("cytosine");
    Attribute Attribute4 = new Attribute("guanine");

    // Declare the class attribute along with its values
    FastVector fvClassVal = new FastVector(2);
    fvClassVal.addElement("yes");
    fvClassVal.addElement("no");
    Attribute ClassAttribute = new Attribute("theClass", fvClassVal);

    // Declare the feature vector
    FastVector fvWekaAttributes = new FastVector(5);
    fvWekaAttributes.addElement(Attribute1);
    fvWekaAttributes.addElement(Attribute2);
    fvWekaAttributes.addElement(Attribute3);
    fvWekaAttributes.addElement(Attribute4);
    fvWekaAttributes.addElement(ClassAttribute);

    // Create an empty training set
    int capacity = listVector.size() + 7;
    Instances isTrainingSet = new Instances("isCpG", fvWekaAttributes, capacity);

    // Set class index
    isTrainingSet.setClassIndex(4);

    // Create the instances from the file with vectors
    for (int i = 0; i < listVector.size(); i++) {
        Instance instance = new Instance(5);
        instance.setValue((Attribute) fvWekaAttributes.elementAt(0), listVector.get(i).get(0));
        instance.setValue((Attribute) fvWekaAttributes.elementAt(1), listVector.get(i).get(1));
        instance.setValue((Attribute) fvWekaAttributes.elementAt(2), listVector.get(i).get(2));
        instance.setValue((Attribute) fvWekaAttributes.elementAt(3), listVector.get(i).get(3));
        instance.setValue((Attribute) fvWekaAttributes.elementAt(4), nameClass);

        // add the instance to the training set
        isTrainingSet.add(instance);
    }
    System.out.println(isTrainingSet);
    return isTrainingSet;
}
From source file:gr.iti.mklab.visual.quantization.SimpleKMeansWithOutput.java
License:Open Source License
/**
 * Generates a clusterer. Has to initialize all fields of the clusterer that are not being set via
 * options.
 *
 * @param data
 *            set of instances serving as training data
 * @throws Exception
 *             if the clusterer has not been generated successfully
 */
@Override
public void buildClusterer(Instances data) throws Exception {
    // can clusterer handle the data?
    getCapabilities().testWithFail(data);

    m_Iterations = 0;
    m_ReplaceMissingFilter = new ReplaceMissingValues();
    Instances instances = new Instances(data);
    instances.setClassIndex(-1);
    if (!m_dontReplaceMissing) {
        m_ReplaceMissingFilter.setInputFormat(instances);
        instances = Filter.useFilter(instances, m_ReplaceMissingFilter);
    }

    m_FullMissingCounts = new int[instances.numAttributes()];
    if (m_displayStdDevs) {
        m_FullStdDevs = new double[instances.numAttributes()];
    }
    m_FullNominalCounts = new int[instances.numAttributes()][0];
    m_FullMeansOrMediansOrModes = moveCentroid(0, instances, false, false);
    for (int i = 0; i < instances.numAttributes(); i++) {
        m_FullMissingCounts[i] = instances.attributeStats(i).missingCount;
        if (instances.attribute(i).isNumeric()) {
            if (m_displayStdDevs) {
                m_FullStdDevs[i] = Math.sqrt(instances.variance(i));
            }
            if (m_FullMissingCounts[i] == instances.numInstances()) {
                m_FullMeansOrMediansOrModes[i] = Double.NaN; // mark missing as mean
            }
        } else {
            m_FullNominalCounts[i] = instances.attributeStats(i).nominalCounts;
            if (m_FullMissingCounts[i] > m_FullNominalCounts[i][Utils.maxIndex(m_FullNominalCounts[i])]) {
                m_FullMeansOrMediansOrModes[i] = -1; // mark missing as most common value
            }
        }
    }

    m_ClusterCentroids = new Instances(instances, m_NumClusters);
    int[] clusterAssignments = new int[instances.numInstances()];
    if (m_PreserveOrder)
        m_Assignments = clusterAssignments;
    m_DistanceFunction.setInstances(instances);

    Random RandomO = new Random(getSeed());
    int instIndex;
    HashMap initC = new HashMap();
    DecisionTableHashKey hk = null;
    Instances initInstances = null;
    if (m_PreserveOrder)
        initInstances = new Instances(instances);
    else
        initInstances = instances;

    if (m_initializeWithKMeansPlusPlus) {
        kMeansPlusPlusInit(initInstances);
    } else {
        for (int j = initInstances.numInstances() - 1; j >= 0; j--) {
            instIndex = RandomO.nextInt(j + 1);
            hk = new DecisionTableHashKey(initInstances.instance(instIndex), initInstances.numAttributes(), true);
            if (!initC.containsKey(hk)) {
                m_ClusterCentroids.add(initInstances.instance(instIndex));
                initC.put(hk, null);
            }
            initInstances.swap(j, instIndex);
            if (m_ClusterCentroids.numInstances() == m_NumClusters) {
                break;
            }
        }
    }
    m_NumClusters = m_ClusterCentroids.numInstances();

    // removing reference
    initInstances = null;

    int i;
    boolean converged = false;
    int emptyClusterCount;
    Instances[] tempI = new Instances[m_NumClusters];
    m_squaredErrors = new double[m_NumClusters];
    m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
    m_ClusterMissingCounts = new int[m_NumClusters][instances.numAttributes()];
    startExecutorPool();

    long start = System.currentTimeMillis();
    while (!converged) {
        emptyClusterCount = 0;
        m_Iterations++;
        converged = true;
        System.out.print(new Date() + ": " + "Iter " + m_Iterations + " ");
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < instances.numInstances(); i++) {
                Instance toCluster = instances.instance(i);
                int newC = clusterProcessedInstance(toCluster, true, true);
                if (newC != clusterAssignments[i]) {
                    converged = false;
                }
                clusterAssignments[i] = newC;
            }
        } else {
            converged = launchAssignToClusters(instances, clusterAssignments);
        }

        // update centroids
        m_ClusterCentroids = new Instances(instances, m_NumClusters);
        for (i = 0; i < m_NumClusters; i++) {
            tempI[i] = new Instances(instances, 0);
        }
        for (i = 0; i < instances.numInstances(); i++) {
            tempI[clusterAssignments[i]].add(instances.instance(i));
        }
        if (m_executionSlots <= 1 || instances.numInstances() < 2 * m_executionSlots) {
            for (i = 0; i < m_NumClusters; i++) {
                if (tempI[i].numInstances() == 0) {
                    // empty cluster
                    emptyClusterCount++;
                } else {
                    moveCentroid(i, tempI[i], true, true);
                }
            }
        } else {
            emptyClusterCount = launchMoveCentroids(tempI);
        }

        if (m_Iterations == m_MaxIterations)
            converged = true;

        if (emptyClusterCount > 0) {
            m_NumClusters -= emptyClusterCount;
            if (converged) {
                Instances[] t = new Instances[m_NumClusters];
                int index = 0;
                for (int k = 0; k < tempI.length; k++) {
                    if (tempI[k].numInstances() > 0) {
                        t[index++] = tempI[k];
                    }
                }
                tempI = t;
            } else {
                tempI = new Instances[m_NumClusters];
            }
        }

        if (!converged) {
            m_ClusterNominalCounts = new int[m_NumClusters][instances.numAttributes()][0];
        }
        System.out.println("Sum of within cluster distances: " + Utils.sum(m_squaredErrors));
        // reset errors to zero
        m_squaredErrors = new double[m_NumClusters];
    }
    long end = System.currentTimeMillis();
    System.out.println("\nClustering completed in " + (end - start) + " ms and converged in " + m_Iterations
            + " iterations");

    // calculate errors
    if (!m_FastDistanceCalc) {
        for (i = 0; i < instances.numInstances(); i++) {
            clusterProcessedInstance(instances.instance(i), true, false);
        }
    }

    if (m_displayStdDevs) {
        m_ClusterStdDevs = new Instances(instances, m_NumClusters);
    }
    m_ClusterSizes = new int[m_NumClusters];
    for (i = 0; i < m_NumClusters; i++) {
        if (m_displayStdDevs) {
            double[] vals2 = new double[instances.numAttributes()];
            for (int j = 0; j < instances.numAttributes(); j++) {
                if (instances.attribute(j).isNumeric()) {
                    vals2[j] = Math.sqrt(tempI[i].variance(j));
                } else {
                    vals2[j] = Utils.missingValue();
                }
            }
            m_ClusterStdDevs.add(new DenseInstance(1.0, vals2));
        }
        m_ClusterSizes[i] = tempI[i].numInstances();
    }
    m_executorPool.shutdown();
}
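The example above clears any class attribute with setClassIndex(-1) before clustering, since the clusterer treats every attribute as input. Below is a minimal caller-side sketch of the same idiom; the file name glass.arff is an assumption, and it uses the stock weka.clusterers.SimpleKMeans rather than the SimpleKMeansWithOutput variant shown here.

import weka.clusterers.SimpleKMeans;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class ClusterWithoutClass {
    public static void main(String[] args) throws Exception {
        Instances data = DataSource.read("glass.arff");
        data.setClassIndex(-1); // ensure no class attribute is set; clustering uses all attributes as input

        SimpleKMeans km = new SimpleKMeans();
        km.setNumClusters(3);
        km.buildClusterer(data);
        System.out.println(km);
    }
}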
From source file:gr.ntua.ece.cslab.panic.core.models.AbstractWekaModel.java
License:Apache License
/**
 * Converts an input space point to a Weka instance.
 *
 * @param point
 * @return
 */
public static Instance convertPointToInstance(InputSpacePoint point, OutputSpacePoint outputPoint) {
    Instance inst = new Instance(point.numberDimensions() + outputPoint.numberDimensions());
    int index = 0;
    for (String k : point.getKeysAsCollection()) {
        Attribute att = new Attribute(k, index++);
        inst.setValue(att, point.getValue(k));
    }
    for (Entry<String, Double> e : outputPoint.getOutputPoints().entrySet()) {
        if (e.getValue() == null) {
            inst.setMissing(index++);
        } else {
            Attribute att = new Attribute(e.getKey(), index++);
            inst.setValue(att, e.getValue());
        }
    }

    // assign instance to dataset
    FastVector att = new FastVector(point.numberDimensions() + 1);
    for (String s : point.getKeysAsCollection())
        att.addElement(new Attribute(s, index++));
    for (String k : outputPoint.getOutputPoints().keySet()) {
        att.addElement(new Attribute(k, index++));
    }
    Instances dataset = new Instances("instances", att, point.numberDimensions() + 1);
    dataset.setClassIndex(dataset.numAttributes() - 1);
    inst.setDataset(dataset);
    return inst;
}
From source file:gr.ntua.ece.cslab.panic.core.models.AbstractWekaModel.java
License:Apache License
public static Instance convertPointToInstance(InputSpacePoint point) {
    Instance inst = new Instance(point.numberDimensions() + 1);
    int index = 0;
    for (String k : point.getKeysAsCollection()) {
        Attribute att = new Attribute(k, index++);
        inst.setValue(att, point.getValue(k));
    }
    inst.setMissing(index);

    // assign instance to dataset
    FastVector att = new FastVector(point.numberDimensions() + 1);
    for (String s : point.getKeysAsCollection())
        att.addElement(new Attribute(s, index++));
    att.addElement(new Attribute("objective", index++));
    Instances dataset = new Instances("instances", att, point.numberDimensions() + 1);
    dataset.setClassIndex(dataset.numAttributes() - 1);
    inst.setDataset(dataset);
    return inst;
}
From source file:gr.ntua.ece.cslab.panic.core.models.AbstractWekaModel.java
License:Apache License
/**
 * Creates a new dataset out of a OutputSpacePoint list.
 *
 * @param points
 * @return
 */
protected static Instances getInstances(List<OutputSpacePoint> points) {
    OutputSpacePoint first = points.get(0);
    FastVector att = new FastVector(first.getInputSpacePoint().numberDimensions() + first.numberDimensions());
    int index = 0;
    for (String s : first.getInputSpacePoint().getKeysAsCollection())
        att.addElement(new Attribute(s, index++));
    for (String s : first.getOutputPoints().keySet())
        att.addElement(new Attribute(s, index++));

    Instances instances = new Instances("instances", att,
            first.getInputSpacePoint().numberDimensions() + first.numberDimensions());
    for (OutputSpacePoint p : points) {
        Instance i = convertPointToInstance(p.getInputSpacePoint(), p);
        instances.add(i);
        //System.out.println(i);
    }
    instances.setClassIndex(first.getInputSpacePoint().numberDimensions());
    return instances;
}
From source file:gr.uoc.nlp.opinion.analysis.suggestion.AnalyzeSuggestions.java
private Instances retriveTrainSet() {
    System.out.println("Retrieving dataset from Database..");

    InstanceQuery query;
    try {
        // initialize database connection, weka api
        query = new InstanceQuery();

        // set database attributes, weka api
        query.setDatabaseURL(this.connection.getJdbcUrl());
        query.setUsername(this.connection.getUsername());
        query.setPassword(this.connection.getPassword());
        query.setQuery(this.queryTrainset());

        // retrieve trainset
        Instances data = query.retrieveInstances();
        data.setClassIndex(data.numAttributes() - 1);

        System.out.println("Done retrieving dataset from Database!");
        return data;
    } catch (Exception ex) {
        System.err.println("Abort!");
        Logger.getLogger(AnalyzeArguments.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:gr.uoc.nlp.opinion.analysis.suggestion.AnalyzeSuggestions.java
/**
 *
 * @param classifier
 * @param unclassified
 * @return
 */
public Instances classify(Classifier classifier, Instances unclassified) {
    unclassified.setClassIndex(unclassified.numAttributes() - 1);

    // new set which will contain the classified instances
    Instances classified = new Instances(unclassified);

    double clsLabel;
    try {
        for (int i = 0; i < unclassified.numInstances(); i++) {
            // classify each unclassified instance
            clsLabel = classifier.classifyInstance(unclassified.instance(i));

            // append the result to the final set
            classified.instance(i).setClassValue(clsLabel);
        }
    } catch (Exception ex) {
        Logger.getLogger(AnalyzeArguments.class.getName()).log(Level.SEVERE, null, ex);
    }
    return classified;
}