List of usage examples for weka.core Instances numAttributes
public int numAttributes()
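Before the per-project examples, here is a minimal self-contained sketch of the two idioms that recur below: using numAttributes() to bound loops over the columns, and setting the class index to the last attribute. The class name and ARFF path are hypothetical; weka.jar is assumed on the classpath.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class NumAttributesDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical path - any ARFF file works
        Instances data = new DataSource("data/weather.numeric.arff").getDataSet();

        // common idiom: treat the last attribute as the class
        data.setClassIndex(data.numAttributes() - 1);

        // numAttributes() bounds any loop over the columns
        for (int i = 0; i < data.numAttributes(); i++) {
            System.out.println(i + ": " + data.attribute(i).name());
        }
    }
}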
From source file: edu.teco.context.recognition.WekaManager.java
License: Apache License

private void fillData(double[] featureValues, String className, Instances data) {
    double[] vals = new double[data.numAttributes()];
    if (vals.length != (featureValues.length + 1)) {
        if (FrameworkContext.WARN)
            Log.w(TAG, "Number of feature values and weka instance values differs.");
    }
    for (int i = 0; i < featureValues.length; i++) {
        vals[i] = featureValues[i];
    }
    vals[vals.length - 1] = attClassVals.indexOf(className);
    DenseInstance instance = new DenseInstance(1.0, vals);
    if (isLogDirectlyToFile) {
        instance.setDataset(data);
        logArffData(instance.toString());
    } else {
        // add
        data.add(instance);
    }
}
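The pattern above, reduced to a self-contained sketch (the attribute and class names are hypothetical, not from WekaManager): the values array is sized by numAttributes(), features fill the leading slots, and the nominal class index goes in the last slot.

import java.util.ArrayList;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;

public class FillDataSketch {
    public static void main(String[] args) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("f1"));
        atts.add(new Attribute("f2"));
        atts.add(new Attribute("activity", Arrays.asList("walking", "sitting"))); // nominal class
        Instances data = new Instances("context", atts, 0);
        data.setClassIndex(data.numAttributes() - 1);

        double[] featureValues = { 0.3, 1.7 };
        double[] vals = new double[data.numAttributes()]; // features + 1 class slot
        System.arraycopy(featureValues, 0, vals, 0, featureValues.length);
        vals[vals.length - 1] = data.classAttribute().indexOfValue("sitting");
        data.add(new DenseInstance(1.0, vals));
        System.out.println(data);
    }
}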
From source file: edu.uga.cs.fluxbuster.classification.Classifier.java
License: Open Source License

/**
 * Executes the classifier.
 *
 * @param prepfeatures the prepared features in arff format
 * @param modelfile the path to the serialized model
 * @param clusters the clusters to classify
 * @return a map of the classified clusters, the keys are the classes
 *         and the values are lists of cluster id's belonging to those classes
 */
private Map<ClusterClass, List<StoredDomainCluster>> executeClassifier(String prepfeatures, String modelfile,
        List<StoredDomainCluster> clusters) {
    Map<ClusterClass, List<StoredDomainCluster>> retval = new HashMap<ClusterClass, List<StoredDomainCluster>>();
    try {
        DataSource source = new DataSource(new ByteArrayInputStream(prepfeatures.getBytes()));
        Instances data = source.getDataSet();
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1);
        }
        String[] options = weka.core.Utils.splitOptions("-p 0");
        J48 cls = (J48) weka.core.SerializationHelper.read(modelfile);
        cls.setOptions(options);
        for (int i = 0; i < data.numInstances(); i++) {
            double pred = cls.classifyInstance(data.instance(i));
            ClusterClass clusClass = ClusterClass
                    .valueOf(data.classAttribute().value((int) pred).toUpperCase());
            if (!retval.containsKey(clusClass)) {
                retval.put(clusClass, new ArrayList<StoredDomainCluster>());
            }
            retval.get(clusClass).add(clusters.get(i));
        }
    } catch (Exception e) {
        if (log.isErrorEnabled()) {
            log.error("Error executing classifier.", e);
        }
    }
    return retval;
}
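Stripped of the cluster bookkeeping, the Weka calls in this example reduce to the following sketch; the model and ARFF file names are hypothetical, and the model is assumed to have been written with SerializationHelper.write.

import weka.classifiers.trees.J48;
import weka.core.Instances;
import weka.core.SerializationHelper;
import weka.core.converters.ConverterUtils.DataSource;

public class ClassifySketch {
    public static void main(String[] args) throws Exception {
        Instances data = new DataSource("features.arff").getDataSet(); // hypothetical file
        if (data.classIndex() == -1) {
            data.setClassIndex(data.numAttributes() - 1); // default: class is last
        }
        J48 cls = (J48) SerializationHelper.read("j48.model"); // hypothetical file
        for (int i = 0; i < data.numInstances(); i++) {
            double pred = cls.classifyInstance(data.instance(i));
            System.out.println(i + " -> " + data.classAttribute().value((int) pred));
        }
    }
}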
From source file: edu.umbc.cs.maple.utils.WekaUtils.java
License: Open Source License

/** Converts a set of instances to svm-light format
 * @param data the weka instances
 * @return the weka instances in svm-light format
 */
public static String arffToSVMLight(Instances data, SVMLightLabelFormat labelFormat) {
    if (labelFormat == SVMLightLabelFormat.CLASSIFICATION && data.numClasses() != 2) {
        throw new IllegalArgumentException(
                "SVM-light classification label format requires that the data contain only two classes.");
    }
    String str = "";
    String endline = System.getProperty("line.separator");
    int numInstances = data.numInstances();
    int numAttributes = data.numAttributes();
    int classAttIdx = data.classIndex();
    for (int instIdx = 0; instIdx < numInstances; instIdx++) {
        Instance inst = data.instance(instIdx);
        // convert the instance label
        if (labelFormat == SVMLightLabelFormat.CLASSIFICATION) {
            str += (inst.classValue() == 0) ? "-1" : "1";
        } else {
            str += inst.classValue();
        }
        str += " ";
        // convert each feature
        for (int attIdx = 0; attIdx < numAttributes; attIdx++) {
            // skip the class attribute
            if (attIdx == classAttIdx)
                continue;
            str += (attIdx + 1) + ":" + inst.value(attIdx) + " ";
        }
        // append the instance info string
        str += "# " + instIdx;
        str += endline;
    }
    return str;
}
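A possible call site for the converter above, as a sketch: it assumes WekaUtils and its SVMLightLabelFormat enum are importable as shown (the enum's exact location in the maple-utils package is an assumption) and that binary.arff is a hypothetical two-class dataset.

import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;
import edu.umbc.cs.maple.utils.WekaUtils;
import edu.umbc.cs.maple.utils.WekaUtils.SVMLightLabelFormat; // location assumed

public class SvmLightExport {
    public static void main(String[] args) throws Exception {
        Instances data = new DataSource("binary.arff").getDataSet(); // hypothetical file
        data.setClassIndex(data.numAttributes() - 1);
        System.out.print(WekaUtils.arffToSVMLight(data, SVMLightLabelFormat.CLASSIFICATION));
    }
}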
From source file: edu.utexas.cs.tactex.utils.RegressionUtils.java
License: Open Source License

/**
 * Adds a y attribute and fills in its values.
 */
public static Instances addYforWeka(Instances xInsts, Double[] y) {
    Instances xyInsts = addYforWeka(xInsts);
    if (y.length != xInsts.numInstances()) {
        log.error("cannot add y to instances since y.length != numInstances");
    }
    // initialize all y values
    int n = xInsts.numAttributes() - 1;
    for (int i = 0; i < y.length; ++i) {
        xInsts.get(i).setValue(n, y[i]);
    }
    return xyInsts;
}
From source file: edu.utexas.cs.tactex.utils.RegressionUtils.java
License: Open Source License

/**
 * Adds a y attribute without giving it values.
 */
public static Instances addYforWeka(Instances xInsts) {
    // add another column for y
    int n = xInsts.numAttributes();
    xInsts.insertAttributeAt(new Attribute(Integer.toString(n)), n);
    // last attribute is y value, the class 'label'
    xInsts.setClassIndex(n);
    return xInsts;
}
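A sketch of how the two overloads compose (RegressionUtils as defined above; the attribute names and values are hypothetical): build x-only Instances, then let addYforWeka append and fill the y column. Note that numAttributes() is read after the insert in one overload and before it in the other.

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instances;
import edu.utexas.cs.tactex.utils.RegressionUtils;

public class AddYSketch {
    public static void main(String[] args) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("x1"));
        atts.add(new Attribute("x2"));
        Instances xInsts = new Instances("x", atts, 2);
        xInsts.add(new DenseInstance(1.0, new double[] { 1.0, 2.0 }));
        xInsts.add(new DenseInstance(1.0, new double[] { 3.0, 4.0 }));

        Instances xyInsts = RegressionUtils.addYforWeka(xInsts, new Double[] { 10.0, 20.0 });
        System.out.println(xyInsts); // third attribute added, set as class, filled with y
    }
}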
From source file: edu.washington.cs.knowitall.summarization.RedundancyClassifier.java
License: Open Source License

public Instances setupInstances(StringReader testReader) {
    Instances instances = null;
    try {
        instances = new Instances(testReader);
    } catch (IOException e) {
        e.printStackTrace();
    }
    instances.setClassIndex(instances.numAttributes() - 1);
    testReader.close();
    return instances;
}
From source file: edu.washington.cs.knowitall.utilities.Classifier.java
License: Open Source License

/**
 * Set up the instances from the reader
 * @param instanceReader the source of the instances
 * @return the instances object
 */
public Instances setupInstances(Reader instanceReader) {
    Instances instances = null;
    try {
        instances = new Instances(instanceReader);
    } catch (IOException e) {
        e.printStackTrace();
    }
    instances.setClassIndex(instances.numAttributes() - 1);
    try {
        instanceReader.close();
    } catch (IOException e) {
        System.err.println("could not close reader");
        e.printStackTrace();
        System.exit(1);
    }
    return instances;
}
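Both variants rely on the same reader idiom. Here it is in a fully self-contained form, with an inline ARFF document instead of an external file (a sketch, not the knowitall code):

import java.io.StringReader;
import weka.core.Instances;

public class ReaderSketch {
    public static void main(String[] args) throws Exception {
        String arff = "@relation toy\n"
                + "@attribute f1 numeric\n"
                + "@attribute class {yes,no}\n"
                + "@data\n"
                + "1.5,yes\n";
        Instances instances = new Instances(new StringReader(arff));
        instances.setClassIndex(instances.numAttributes() - 1); // last attribute as class
        System.out.println(instances.numAttributes() + " attributes, class = "
                + instances.classAttribute().name());
    }
}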
From source file: eksploracja.Eksploracja.java

/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws Exception {
    System.out.println("Hello world - this is exploration");

    // Loading the data
    String filename = "C:\\Program Files\\Weka-3-8\\data\\weather.numeric.arff";
    DataSource source = new DataSource(filename);
    Instances mojeDane = source.getDataSet();

    // Displaying the data
    System.out.println("Data: ");
    // System.out.println(mojeDane); // the whole dataset
    Instance wiersz0 = mojeDane.firstInstance();
    System.out.println("First " + mojeDane.firstInstance()); // first row
    System.out.println("Last " + mojeDane.lastInstance()); // last row
    System.out.println("\nNumber of instances: " + mojeDane.numInstances());
    System.out.println("\nNumber of attributes: " + mojeDane.numAttributes());
    for (int i = 0; i < mojeDane.numAttributes(); i++) {
        System.out.println(i + ". " + mojeDane.attribute(i));
        Attribute atr = mojeDane.attribute(i);
        System.out.println(i + " " + atr.name());
        if (atr.isNominal()) {
            System.out.println("Data type: nominal");
        } else {
            System.out.println("Data type: numeric");
        }
    }

    // Printing the data in numeric form
    System.out.println("Data - as numbers: ");
    System.out.println(Arrays.toString(wiersz0.toDoubleArray()));
}
From source file: elh.eus.absa.Features.java
License: Open Source License

/**
 * Creates a feature set from a previously saved model. This allows previously saved
 * feature sets to be loaded.
 *
 * @param model string: path to the serialized model containing header information
 * @throws IOException
 */
private void createFeatureSetFromModel(String model) throws IOException {
    try {
        WekaWrapper ww = new WekaWrapper(model);
        Instances header = ww.loadHeader(model);
        int attNum = header.numAttributes();
        for (int i = 0; i < attNum; i++) {
            Attribute att = header.attribute(i);
            String name = att.name();
            if (att.isNumeric()) {
                addNumericFeature(name);
            } else if (att.isNominal()) {
                ArrayList<String> vals = new ArrayList<String>();
                Enumeration<Object> e = att.enumerateValues();
                while (e.hasMoreElements()) {
                    vals.add(e.nextElement().toString());
                }
                addNominalFeature(name, vals);
            }
        }
        // General polarity lexicon
        if (header.attribute("polLexGen_posScore") != null) {
            this.polarLexiconGen = new Lexicon(new File(params.getProperty("polarLexiconGeneral")), "lemma");
            System.err.println("Features : createFeatureSet() - General polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)");
            System.out.println("Features : createFeatureSet() - General polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconGeneral") + " (" + this.polarLexiconGen.size() + " entries)");
        }
        // Domain polarity lexicon
        if (header.attribute("polLexDom_posScore") != null) {
            this.polarLexiconDom = new Lexicon(new File(params.getProperty("polarLexiconDomain")), "lemma");
            System.err.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)");
            System.out.println("Features : createFeatureSet() - Domain polarity lexicon loaded -> "
                    + params.getProperty("polarLexiconDomain") + " (" + this.polarLexiconDom.size() + " entries)");
        }
        // Load clark cluster category info from files
        loadClusterFeatures("clark");
        // Load brown cluster category info from files
        loadClusterFeatures("brown");
        // Load word2vec cluster category info from files
        loadClusterFeatures("word2vec");
    } catch (Exception e) {
        System.err.println("Features::createFeatureSetFromFile -> error when loading model header");
        e.printStackTrace();
    }
}
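The numeric/nominal dispatch over the header reduces to this minimal sketch (hypothetical attribute names, no elh.eus.absa dependencies):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import weka.core.Attribute;
import weka.core.Instances;

public class HeaderScan {
    public static void main(String[] args) {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("polLexGen_posScore"));                       // numeric
        atts.add(new Attribute("polarityCat", Arrays.asList("pos", "neg"))); // nominal
        Instances header = new Instances("header", atts, 0);

        for (int i = 0; i < header.numAttributes(); i++) {
            Attribute att = header.attribute(i);
            if (att.isNumeric()) {
                System.out.println(att.name() + ": numeric");
            } else if (att.isNominal()) {
                Enumeration<Object> e = att.enumerateValues();
                StringBuilder vals = new StringBuilder();
                while (e.hasMoreElements()) {
                    vals.append(e.nextElement()).append(' ');
                }
                System.out.println(att.name() + ": nominal { " + vals.toString().trim() + " }");
            }
        }
    }
}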
From source file: elh.eus.absa.Features.java
License: Open Source License

/**
 * Fills the attribute vectors for the instances existing in the given corpus.
 * Attribute vectors contain the features loaded by the createFeatureSet() function.
 *
 * @param save boolean: whether the Instances file should be saved to an arff file or not.
 * @return Weka Instances object containing the attribute vectors filled with the features
 *         specified in the parameter file.
 */
public Instances loadInstances(boolean save, String prefix) throws IOException {
    String savePath = params.getProperty("fVectorDir") + File.separator + "arff" + File.separator + "train_"
            + prefix;
    HashMap<String, Opinion> trainExamples = corpus.getOpinions();
    int trainExamplesNum = trainExamples.size();
    int bowWin = 0;
    if (params.containsKey("window")) {
        bowWin = Integer.parseInt(params.getProperty("window"));
        savePath = savePath + "_w" + bowWin;
    }
    if (params.containsKey("lemmaNgrams")) {
        Properties posProp = NLPpipelineWrapper.setPostaggerProperties(params.getProperty("pos-model"),
                params.getProperty("lemma-model"), corpus.getLang(), "bin", "false");
        postagger = new eus.ixa.ixa.pipe.pos.Annotate(posProp);
    }
    // Create the Weka object for the training set
    Instances rsltdata = new Instances("train", atts, trainExamplesNum);
    System.err.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    System.out.println("Features: loadInstances() - featNum: " + this.featNum + " - trainset attrib num -> "
            + rsltdata.numAttributes() + " - ");
    int instId = 1;
    // fill the vectors for each training example
    for (String oId : trainExamples.keySet()) {
        // value vector
        double[] values = new double[featNum];
        // first element is the instanceId
        values[rsltdata.attribute("instanceId").index()] = instId;
        String opNormalized = corpus.getOpinionSentence(oId);
        // compute uppercase ratio before normalization (if needed)
        double upRatio = 0.0;
        if (params.getProperty("upperCaseRatio", "no").equalsIgnoreCase("yes")) {
            String upper = opNormalized.replaceAll("[\\p{Ll}]", "");
            upRatio = (double) upper.length() / (double) opNormalized.length();
            values[rsltdata.attribute("upperCaseRation").index()] = upRatio;
        }
        // string normalization (emoticons, twitter grammar, ...)
        if ((params.containsKey("wfngrams") || params.containsKey("lemmaNgrams"))
                && (!params.getProperty("normalization", "none").equalsIgnoreCase("noEmot"))) {
            opNormalized = normalize(opNormalized, params.getProperty("normalization", "none"));
        }
        // process the current instance with the NLP pipeline in order to get token and lemma|pos features
        KAFDocument nafinst = new KAFDocument("", "");
        String nafname = trainExamples.get(oId).getsId().replace(':', '_');
        String nafDir = params.getProperty("kafDir");
        String nafPath = nafDir + File.separator + nafname + ".kaf";
        // counter for opinion sentence token number. Used for computing relative values of the features
        int tokNum = 1;
        try {
            if (params.containsKey("lemmaNgrams")) {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTokPos(opNormalized, corpus.getLang(),
                            params.getProperty("pos-model"), params.getProperty("lemma-model"), postagger);
                    Files.createDirectories(Paths.get(nafDir));
                    nafinst.save(nafPath);
                }
                tokNum = nafinst.getWFs().size();
            } else {
                if (FileUtilsElh.checkFile(nafPath)) {
                    nafinst = KAFDocument.createFromFile(new File(nafPath));
                } else {
                    nafinst = NLPpipelineWrapper.ixaPipesTok(opNormalized, corpus.getLang());
                }
                tokNum = nafinst.getWFs().size();
            }
        } catch (IOException | JDOMException e) {
            System.err.println("Features::loadInstances() - error when NLP processing the instance " + instId
                    + "|" + oId + ") for filling the attribute vector");
            e.printStackTrace();
            System.exit(5);
        }
        LinkedList<String> ngrams = new LinkedList<String>();
        int ngramDim;
        try {
            ngramDim = Integer.valueOf(params.getProperty("wfngrams"));
        } catch (Exception e) {
            ngramDim = 0;
        }
        boolean polNgrams = false;
        if (params.containsKey("polNgrams")) {
            polNgrams = params.getProperty("polNgrams").equalsIgnoreCase("yes");
        }
        List<WF> window = nafinst.getWFs();
        Integer end = corpus.getOpinion(oId).getTo();
        // apply window if window active (>0) and if the target is not null (to=0)
        if ((bowWin > 0) && (end > 0)) {
            Integer start = corpus.getOpinion(oId).getFrom();
            Integer to = window.size();
            Integer from = 0;
            end++;
            for (int i = 0; i < window.size(); i++) {
                WF wf = window.get(i);
                if ((wf.getOffset() == start) && (i >= bowWin)) {
                    from = i - bowWin;
                } else if (wf.getOffset() >= end) {
                    if (i + bowWin < window.size()) {
                        to = i + bowWin;
                    }
                    break;
                }
            }
            window = window.subList(from, to);
        }
        List<String> windowWFIds = new ArrayList<String>();
        // word form ngram related features
        for (WF wf : window) {
            windowWFIds.add(wf.getId());
            String wfStr = wf.getForm();
            if (params.containsKey("wfngrams") && ngramDim > 0) {
                if (!savePath.contains("_wf" + ngramDim)) {
                    savePath = savePath + "_wf" + ngramDim;
                }
                // if the current word form is in the ngram list activate the feature in the vector
                if (ngrams.size() >= ngramDim) {
                    ngrams.removeFirst();
                }
                ngrams.add(wfStr);
                // add ngrams to the feature vector
                checkNgramFeatures(ngrams, values, "wf", 1, false);
            }
            // Clark cluster info corresponding to the current word form
            if (params.containsKey("clark") && attributeSets.get("ClarkCl").containsKey(wfStr)) {
                if (!savePath.contains("_cl")) {
                    savePath = savePath + "_cl";
                }
                values[rsltdata.attribute("ClarkClId_" + attributeSets.get("ClarkCl").get(wfStr)).index()]++;
            }
            // Brown cluster info corresponding to the current word form
            if (params.containsKey("brown") && attributeSets.get("BrownCl").containsKey(wfStr)) {
                if (!savePath.contains("_br")) {
                    savePath = savePath + "_br";
                }
                values[rsltdata.attribute("BrownClId_" + attributeSets.get("BrownCl").get(wfStr)).index()]++;
            }
            // word2vec cluster info corresponding to the current word form
            if (params.containsKey("word2vec") && attributeSets.get("w2vCl").containsKey(wfStr)) {
                if (!savePath.contains("_w2v")) {
                    savePath = savePath + "_w2v";
                }
                values[rsltdata.attribute("w2vClId_" + attributeSets.get("w2vCl").get(wfStr)).index()]++;
            }
        }
        // empty ngram list and add remaining ngrams to the feature list
        checkNgramFeatures(ngrams, values, "wf", 1, true);
        // PoS tagger related attributes: lemmas and pos tags
        if (params.containsKey("lemmaNgrams")
                || (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0"))
                || params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
            ngrams = new LinkedList<String>();
            if (params.containsKey("lemmaNgrams")
                    && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                ngramDim = Integer.valueOf(params.getProperty("lemmaNgrams"));
            } else {
                ngramDim = 3;
            }
            LinkedList<String> posNgrams = new LinkedList<String>();
            int posNgramDim = 0;
            if (params.containsKey("pos")) {
                posNgramDim = Integer.valueOf(params.getProperty("pos"));
            }
            for (Term t : nafinst.getTermsFromWFs(windowWFIds)) {
                // lemmas
                if ((params.containsKey("lemmaNgrams")) || params.containsKey("polarLexiconGeneral")
                        || params.containsKey("polarLexiconDomain")) {
                    if (!savePath.contains("_l" + ngramDim)) {
                        savePath = savePath + "_l" + ngramDim;
                    }
                    String lemma = t.getLemma();
                    if (ngrams.size() >= ngramDim) {
                        ngrams.removeFirst();
                    }
                    ngrams.add(lemma);
                    // add ngrams to the feature vector
                    for (int i = 0; i < ngrams.size(); i++) {
                        String ng = featureFromArray(ngrams.subList(0, i + 1), "lemma");
                        // if the current lemma is in the ngram list activate the feature in the vector
                        if (params.containsKey("lemmaNgrams")
                                && (!params.getProperty("lemmaNgrams").equalsIgnoreCase("0"))) {
                            Attribute ngAtt = rsltdata.attribute(ng);
                            if (ngAtt != null) {
                                addNumericToFeatureVector(ng, values, 1);
                            }
                        }
                        ng = featureFromArray(ngrams.subList(0, i + 1), "");
                        if (params.containsKey("polarLexiconGeneral")
                                || params.containsKey("polarLexiconDomain")) {
                            checkPolarityLexicons(ng, values, tokNum, polNgrams);
                        } // end polarity ngram checker
                    } // end ngram checking
                }
                // pos tags
                if (params.containsKey("pos") && !params.getProperty("pos").equalsIgnoreCase("0")) {
                    if (!savePath.contains("_p")) {
                        savePath = savePath + "_p";
                    }
                    if (posNgrams.size() >= posNgramDim) {
                        posNgrams.removeFirst();
                    }
                    posNgrams.add(t.getPos());
                    // add ngrams to the feature vector
                    checkNgramFeatures(posNgrams, values, "pos", 1, false);
                }
            } // end for over terms
            // empty ngram list and add remaining ngrams to the feature list
            while (!ngrams.isEmpty()) {
                String ng = featureFromArray(ngrams, "lemma");
                // if the current lemma is in the ngram list activate the feature in the vector
                if (rsltdata.attribute(ng) != null) {
                    addNumericToFeatureVector(ng, values, 1);
                }
                // polarity lexicons
                if (params.containsKey("polarLexiconGeneral") || params.containsKey("polarLexiconDomain")) {
                    checkPolarityLexicons(ng, values, tokNum, polNgrams);
                } // end polarity ngram checker
                ngrams.removeFirst();
            }
            // empty pos ngram list and add remaining pos ngrams to the feature list
            checkNgramFeatures(posNgrams, values, "pos", 1, true);
        }
        // add sentence length as a feature
        if (params.containsKey("sentenceLength")
                && (!params.getProperty("sentenceLength").equalsIgnoreCase("no"))) {
            values[rsltdata.attribute("sentenceLength").index()] = tokNum;
        }
        // create object for the current instance and associate it with the current train dataset
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata);
        // add category attribute values
        String cat = trainExamples.get(oId).getCategory();
        if (params.containsKey("categories") && params.getProperty("categories").compareTo("E&A") == 0) {
            if (cat.compareTo("NULL") == 0) {
                inst.setValue(rsltdata.attribute("entCat").index(), cat);
                inst.setValue(rsltdata.attribute("attCat").index(), cat);
            } else {
                String[] splitCat = cat.split("#");
                inst.setValue(rsltdata.attribute("entCat").index(), splitCat[0]);
                inst.setValue(rsltdata.attribute("attCat").index(), splitCat[1]);
            }
        } else if (params.containsKey("categories") && params.getProperty("categories").compareTo("E#A") == 0) {
            inst.setValue(rsltdata.attribute("entAttCat").index(), cat);
        }
        if (params.containsKey("polarity") && params.getProperty("polarity").compareTo("yes") == 0) {
            // add class value as a double (Weka stores all values as doubles)
            String pol = normalizePolarity(trainExamples.get(oId).getPolarity());
            if (pol != null && !pol.isEmpty()) {
                inst.setValue(rsltdata.attribute("polarityCat"), pol);
            } else {
                inst.setMissing(rsltdata.attribute("polarityCat"));
            }
        }
        // add instance to train data
        rsltdata.add(inst);
        // store opinion Id and instance Id
        this.opInst.put(oId, instId);
        instId++;
    }
    System.err.println("Features : loadInstances() - training data ready total number of examples -> "
            + trainExamplesNum + " - " + rsltdata.numInstances());
    if (save) {
        try {
            savePath = savePath + ".arff";
            System.err.println("arff written to: " + savePath);
            ArffSaver saver = new ArffSaver();
            saver.setInstances(rsltdata);
            saver.setFile(new File(savePath));
            saver.writeBatch();
        } catch (IOException e1) {
            e1.printStackTrace();
        } catch (Exception e2) {
            e2.printStackTrace();
        }
    }
    return rsltdata;
}
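The instance-creation and saving skeleton at the core of loadInstances(), as a self-contained sketch (the feature names and output path are hypothetical):

import java.io.File;
import java.util.ArrayList;
import java.util.Arrays;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.SparseInstance;
import weka.core.converters.ArffSaver;

public class SparseTrainSketch {
    public static void main(String[] args) throws Exception {
        ArrayList<Attribute> atts = new ArrayList<Attribute>();
        atts.add(new Attribute("instanceId"));
        atts.add(new Attribute("sentenceLength"));
        atts.add(new Attribute("polarityCat", Arrays.asList("positive", "negative", "neutral")));
        Instances rsltdata = new Instances("train", atts, 0);
        rsltdata.setClassIndex(rsltdata.numAttributes() - 1);

        double[] values = new double[rsltdata.numAttributes()]; // mostly zeros -> sparse pays off
        values[rsltdata.attribute("instanceId").index()] = 1;
        values[rsltdata.attribute("sentenceLength").index()] = 12;
        Instance inst = new SparseInstance(1.0, values);
        inst.setDataset(rsltdata); // needed before setting nominal values by label
        inst.setValue(rsltdata.attribute("polarityCat"), "positive");
        rsltdata.add(inst);

        ArffSaver saver = new ArffSaver();
        saver.setInstances(rsltdata);
        saver.setFile(new File("train_sketch.arff")); // hypothetical output path
        saver.writeBatch();
    }
}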