List of usage examples for the weka.core.SparseInstance constructor
public SparseInstance(int numAttributes)
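A minimal, self-contained sketch of the constructor in isolation (assuming Weka 3.7 or later, where Instances accepts an ArrayList&lt;Attribute&gt;; the class, relation and attribute names below are made up for illustration):

import java.util.ArrayList;
import weka.core.Attribute;
import weka.core.Instances;
import weka.core.SparseInstance;

public class SparseInstanceDemo {
    public static void main(String[] args) {
        // Illustrative schema; the relation and attribute names are arbitrary.
        ArrayList<Attribute> attrs = new ArrayList<Attribute>();
        attrs.add(new Attribute("f0"));
        attrs.add(new Attribute("f1"));
        attrs.add(new Attribute("f2"));
        Instances data = new Instances("demo", attrs, 0);

        // One slot per attribute; all values start out flagged as missing.
        SparseInstance inst = new SparseInstance(data.numAttributes());
        inst.setDataset(data);                    // attach the schema before setting values
        inst.setValue(data.attribute("f0"), 1.0);
        inst.setValue(data.attribute("f1"), 0.0); // zeros are not stored explicitly
        inst.setValue(data.attribute("f2"), 5.0);
        data.add(inst);
        System.out.println(data);
    }
}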
From source file:cluster.ABC.ClusterUtils.java
License:Open Source License
/** Finds the sum of instance sum with instance inst */
public static Instance sumWithInstance(Instance sum, Instance inst, Instances m_Instances) throws Exception {
    Instance newSum;
    if (sum == null) {
        if (inst instanceof SparseInstance) {
            newSum = new SparseInstance(inst);
            newSum.setDataset(m_Instances);
        } else {
            newSum = new Instance(inst);
            newSum.setDataset(m_Instances);
        }
    } else {
        newSum = sumInstances(sum, inst, m_Instances);
    }
    return newSum;
}
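The sumInstances helper that this snippet delegates to is not included on this page. Purely as a hypothetical sketch, using the same pre-3.7 Weka API as the snippet above (where weka.core.Instance is a concrete class), an element-wise sum might look like the following; this is not the project's actual implementation:

// Hypothetical sketch only -- cluster.ABC.ClusterUtils defines its own sumInstances.
// Assumes both instances share the attribute layout of m_Instances.
public static Instance sumInstances(Instance a, Instance b, Instances m_Instances) throws Exception {
    double[] values = new double[m_Instances.numAttributes()];
    for (int i = 0; i < values.length; i++) {
        values[i] = a.value(i) + b.value(i);
    }
    Instance newSum = (a instanceof SparseInstance)
            ? new SparseInstance(1.0, values)   // keep the result sparse when the input was sparse
            : new Instance(1.0, values);
    newSum.setDataset(m_Instances);
    return newSum;
}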
From source file:com.yahoo.research.scoring.classifier.NutchOnlineClassifier.java
License:Apache License
/**
 * Converts an {@link AnthURL} into an {@link Instance} which can be handled
 * by the {@link Classifier}.
 *
 * @param url
 *            the {@link AnthURL} which should be transformed/converted.
 * @return the resulting {@link Instance}.
 */
private static Instance convert(AnthURL url) {
    if (url != null) {
        Instance inst = new SparseInstance(dimension);
        inst.replaceMissingValues(replaceMissingValues);
        inst.setDataset(instances);
        inst.setValue(attributesIndex.get("class"), (url.sem ? "sem" : "nonsem"));
        inst.setValue(attributesIndex.get("sempar"), (url.semFather ? 1 : 0));
        inst.setValue(attributesIndex.get("nonsempar"), (url.nonSemFather ? 1 : 0));
        inst.setValue(attributesIndex.get("semsib"), (url.semSibling ? 1 : 0));
        // Note: this repeats the "nonsempar" assignment above; a "nonsemsib" attribute may have been intended.
        inst.setValue(attributesIndex.get("nonsempar"), (url.nonSemFather ? 1 : 0));
        inst.setValue(attributesIndex.get("domain"), url.uri.getHost());
        Set<String> tokens = new HashSet<String>();
        tokens.addAll(tokenizer(url.uri.getPath()));
        tokens.addAll(tokenizer(url.uri.getQuery()));
        tokens.addAll(tokenizer(url.uri.getFragment()));
        for (String tok : tokens) {
            inst.setValue(attributesIndex.get(getAttributeNameOfHash(getHash(tok, hashTrickSize))), 1);
        }
        return inst;
    } else {
        System.out.println("Input AnthURL for conversion into instance was null.");
        return null;
    }
}
From source file:edu.cmu.lti.oaqa.baseqa.providers.ml.classifiers.MekaProvider.java
License:Apache License
@Override
public Map<String, Double> infer(Map<String, Double> features) throws AnalysisEngineProcessException {
    Instance instance = new SparseInstance(features.size());
    instance.setDataset(datasetSchema);
    for (Map.Entry<String, Double> e : features.entrySet()) {
        Attribute attribute = datasetSchema.attribute(e.getKey());
        if (attribute == null)
            continue;
        instance.setValue(attribute, e.getValue());
    }
    double[] probs;
    try {
        probs = classifier.distributionForInstance(instance);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    assert datasetSchema.classIndex() == probs.length;
    return IntStream.range(0, probs.length).boxed()
            .collect(toMap(i -> datasetSchema.attribute(i).name(), i -> probs[i]));
}
From source file:edu.cmu.lti.oaqa.baseqa.providers.ml.classifiers.MekaProvider.java
License:Apache License
@Override
public void train(List<Map<String, Double>> X, List<String> Y, boolean crossValidation)
        throws AnalysisEngineProcessException {
    // create attribute (including label) info
    ArrayList<Attribute> attributes = new ArrayList<>();
    List<String> labelNames = ClassifierProvider.labelNames(Y);
    labelNames.stream().map(attr -> new Attribute(attr, Arrays.asList("y", "n")))
            .forEachOrdered(attributes::add);
    List<String> featureNames = ClassifierProvider.featureNames(X);
    featureNames.stream().map(Attribute::new).forEachOrdered(attributes::add);
    String name = Files.getNameWithoutExtension(modelFile.getName());
    datasetSchema = new Instances(name, attributes, 0);
    datasetSchema.setClassIndex(labelNames.size());
    // add instances
    // due to the limitation of the interface definition, X, Y should be reorganized
    SetMultimap<Map<String, Double>, String> XY = HashMultimap.create();
    IntStream.range(0, X.size()).forEach(i -> XY.put(X.get(i), Y.get(i)));
    Instances trainingInstances = new Instances(datasetSchema, XY.size());
    for (Map.Entry<Map<String, Double>, Collection<String>> entry : XY.asMap().entrySet()) {
        Set<String> y = ImmutableSet.copyOf(entry.getValue());
        Map<String, Double> x = entry.getKey();
        SparseInstance instance = new SparseInstance(labelNames.size() + x.size());
        for (String labelName : labelNames) {
            instance.setValue(datasetSchema.attribute(labelName), y.contains(labelName) ? "y" : "n");
        }
        for (Map.Entry<String, Double> e : x.entrySet()) {
            instance.setValue(datasetSchema.attribute(e.getKey()), e.getValue());
        }
        trainingInstances.add(instance);
    }
    // training
    try {
        classifier = (MultiLabelClassifier) AbstractClassifier.forName(classifierName, options);
        classifier.buildClassifier(trainingInstances);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    try {
        SerializationHelper.write(modelFile.getAbsolutePath(), classifier);
        SerializationHelper.write(datasetSchemaFile.getAbsolutePath(), datasetSchema);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
    if (crossValidation) {
        try {
            Evaluation eval = new Evaluation(trainingInstances);
            Random rand = new Random();
            eval.crossValidateModel(classifier, trainingInstances, 10, rand);
            LOG.debug(eval.toSummaryString());
        } catch (Exception e) {
            throw new AnalysisEngineProcessException(e);
        }
    }
}
From source file:elh.eus.absa.CLI.java
License:Open Source License
/**
 * Main access to the train-atc functionalities. Train ATC using a double one vs. all classifier
 * (E and A) for E#A aspect categories
 * @throws Exception
 */
public final void trainATC2(final InputStream inputStream) throws IOException {
    // load training parameters file
    String paramFile = parsedArguments.getString("params");
    String testFile = parsedArguments.getString("testset");
    String paramFile2 = parsedArguments.getString("params2");
    String corpusFormat = parsedArguments.getString("corpusFormat");
    //String validation = parsedArguments.getString("validation");
    String lang = parsedArguments.getString("language");
    //int foldNum = Integer.parseInt(parsedArguments.getString("foldNum"));
    //boolean printPreds = parsedArguments.getBoolean("printPreds");
    boolean nullSentenceOpinions = parsedArguments.getBoolean("nullSentences");
    boolean onlyTest = parsedArguments.getBoolean("testOnly");
    double threshold = 0.5;
    double threshold2 = 0.5;
    String modelsPath = "/home/inaki/elixa-atp/ovsaModels";

    CorpusReader reader = new CorpusReader(inputStream, corpusFormat, nullSentenceOpinions, lang);
    Features atcTrain = new Features(reader, paramFile, "3");
    Instances traindata = atcTrain.loadInstances(true, "atc");

    if (onlyTest) {
        if (FileUtilsElh.checkFile(testFile)) {
            System.err.println("read from test file");
            reader = new CorpusReader(new FileInputStream(new File(testFile)), corpusFormat,
                    nullSentenceOpinions, lang);
            atcTrain.setCorpus(reader);
            traindata = atcTrain.loadInstances(true, "atc");
        }
    }

    //setting class attribute (entCat|attCat|entAttCat|polarityCat)
    //HashMap<String, Integer> opInst = atcTrain.getOpinInst();
    //WekaWrapper classifyAtts;
    WekaWrapper onevsall;
    try {
        //classify.printMultilabelPredictions(classify.multiLabelPrediction()); */
        //onevsall
        Instances entdata = new Instances(traindata);
        entdata.deleteAttributeAt(entdata.attribute("attCat").index());
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        entdata.setClassIndex(entdata.attribute("entCat").index());
        onevsall = new WekaWrapper(entdata, true);

        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "entCat");
            System.out.println("trainATC: one vs all models ready");
        }
        onevsall.setTestdata(entdata);
        HashMap<Integer, HashMap<String, Double>> ovsaRes = onevsall.predictOneVsAll(modelsPath,
                paramFile + "entCat");
        System.out.println("trainATC: one vs all predictions ready");
        HashMap<Integer, String> instOps = new HashMap<Integer, String>();
        for (String oId : atcTrain.getOpinInst().keySet()) {
            instOps.put(atcTrain.getOpinInst().get(oId), oId);
        }

        atcTrain = new Features(reader, paramFile2, "3");
        entdata = atcTrain.loadInstances(true, "attTrain2_data");
        entdata.deleteAttributeAt(entdata.attribute("entAttCat").index());
        //entdata.setClassIndex(entdata.attribute("entCat").index());

        Attribute insAtt = entdata.attribute("instanceId");
        double maxInstId = entdata.kthSmallestValue(insAtt, entdata.numDistinctValues(insAtt) - 1);
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins" + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold) {
                    //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    // for the first one update the instances
                    if (oSubId >= 1) {
                        Instance newIns = new SparseInstance(currentInst);
                        newIns.setDataset(entdata);
                        entdata.add(newIns);
                        newIns.setValue(insAtt, maxInstId + oSubId);
                        newIns.setClassValue(cl);
                        instOps.put((int) maxInstId + oSubId, oId);
                    }
                    // if there are more, create new instances
                    else {
                        currentInst.setClassValue(cl);
                        //create and add opinion to the structure
                        // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                        //Opinion op = new Opinion(instOps.get(i)+"_"+oSubId, "", 0, 0, "", cl, sId);
                        //reader.addOpinion(op);
                    }
                    oSubId++;
                }
            } //finished updating instances data
        }

        entdata.setClass(entdata.attribute("attCat"));
        onevsall = new WekaWrapper(entdata, true);

        /** Second classifier */
        if (!onlyTest) {
            onevsall.trainOneVsAll(modelsPath, paramFile + "attCat");
            System.out.println("trainATC: one vs all attcat models ready");
        }

        ovsaRes = onevsall.predictOneVsAll(modelsPath, paramFile + "entAttCat");

        insAtt = entdata.attribute("instanceId");
        maxInstId = entdata.kthSmallestValue(insAtt, insAtt.numValues());
        System.err.println("last instance has index: " + maxInstId);
        for (int ins = 0; ins < entdata.numInstances(); ins++) {
            System.err.println("ins: " + ins);
            int i = (int) entdata.instance(ins).value(insAtt);
            Instance currentInst = entdata.instance(ins);
            //System.err.println("instance "+i+" oid "+kk.get(i+1)+"kk contains key i?"+kk.containsKey(i));
            String sId = reader.getOpinion(instOps.get(i)).getsId();
            String oId = instOps.get(i);
            reader.removeSentenceOpinions(sId);
            int oSubId = 0;
            for (String cl : ovsaRes.get(i).keySet()) {
                //System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                if (ovsaRes.get(i).get(cl) > threshold2) {
                    ///System.err.println("instance: "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                    if (ovsaRes.get(i).get(cl) > threshold) {
                        //System.err.println("one got through ! instance "+i+" class "+cl+" value: "+ovsaRes.get(i).get(cl));
                        // for the first one update the instances
                        if (oSubId >= 1) {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        // if there are more, create new instances
                        else {
                            String label = currentInst.stringValue(entdata.attribute("entAtt")) + "#" + cl;
                            //create and add opinion to the structure
                            // trgt, offsetFrom, offsetTo, polarity, cat, sId);
                            reader.removeOpinion(oId);
                            Opinion op = new Opinion(oId + "_" + oSubId, "", 0, 0, "", label, sId);
                            reader.addOpinion(op);
                        }
                        oSubId++;
                    }
                }
            } //finished updating instances data
        }
        reader.print2Semeval2015format(paramFile + "entAttCat.xml");
    } catch (Exception e) {
        e.printStackTrace();
    }
    //traindata.setClass(traindata.attribute("entAttCat"));
    System.err.println("DONE CLI train-atc2 (oneVsAll)");
}
From source file:europarl.PhraseTranslation.java
License:Open Source License
public boolean getFromGz(String fileName, String targetWord, int limit) {
    String strLine;
    ArrayList<String> line_triple = new ArrayList<String>();
    BufferedReader gzipReader;
    Pattern word_align = Pattern.compile("(\\w+) \\(\\{(.*?)\\}\\) ");
    Bag<String> words_list = new Bag<String>(); // Set of ALL words: it will be the list of attributes
    ArrayList<PhraseTranslation> translations = new ArrayList<PhraseTranslation>();
    try {
        gzipReader = new BufferedReader(
                new InputStreamReader(new GZIPInputStream(new FileInputStream(fileName))));
        while ((strLine = gzipReader.readLine()) != null) // read everything
        {
            line_triple.add(strLine);
            if (line_triple.size() == 3) // triple finished
            {
                //TODO: match only complete words
                //TODO: stem it before doing this
                Matcher matcher = word_align.matcher(line_triple.get(2));
                String[] foreign_words = line_triple.get(1).split(" ");
                line_triple.clear();
                if (!strLine.contains(targetWord)) // skip it
                    continue;
                ArrayList<String> e_phrase = new ArrayList<String>();
                String translation = "";
                while (matcher.find()) // each iteration is word + alignment
                {
                    assert matcher.groupCount() == 2;
                    String e_word = matcher.group(1).trim();
                    if (e_word.equals("NULL"))
                        e_word = "";
                    if (stopwordsList.contains(e_word))
                        continue;
                    if (stemmer != null)
                        e_word = stemmer.stem(e_word);
                    e_phrase.add(e_word);
                    words_list.add(e_word);
                    // we don't care about the alignment of non-target words
                    if (!e_word.equals(targetWord))
                        continue;
                    // parse the { x y z } alignment part
                    ArrayList<Integer> f_words = new ArrayList<Integer>();
                    translation = "";
                    // for each number between curly brackets
                    for (String number : matcher.group(2).split(" ")) {
                        if (!number.isEmpty()) {
                            int n_word = Integer.parseInt(number) - 1;
                            f_words.add(n_word);
                            translation += foreign_words[n_word] + " ";
                        }
                    } // end of curly brackets for
                } // end of word+alignment while
                if (!translation.isEmpty()) {
                    PhraseTranslation trans = new PhraseTranslation(e_phrase, translation);
                    translations.add(trans);
                }
                line_triple.clear();
            } // end of triple-finished if
            if (translations.size() == limit)
                break; // stop collecting!
        } // end of the read-everything while
    } catch (Exception e) {
        log.error("Error: " + e);
        e.printStackTrace();
        return false;
    }
    // what we NOW have: a set of attributes in HashSet<String> words_list
    // and an ArrayList<PhraseTranslation> translations
    log.info("Collected " + translations.size() + " phrases and " + words_list.size() + " words");
    postProcessData(translations, words_list);
    // now convert the data we collected to Weka data
    // we needed to do "double passing" because we need to initialize
    // the dataset with the complete list of attributes
    // this will convert words to attributes: they are all "boolean"
    ArrayList<Attribute> attrs = new ArrayList<Attribute>();
    HashMap<String, Attribute> attrs_map = new HashMap<String, Attribute>();
    Attribute att;
    for (String word : words_list) {
        att = new Attribute(word);
        attrs.add(att);
        attrs_map.put(word, att);
    }
    // now we need to manage the class:
    // each translation is a class, so we need to get all of them
    HashMap<String, Integer> class_map = new HashMap<String, Integer>();
    ArrayList<String> classes = new ArrayList<String>();
    for (PhraseTranslation phraseTranslation : translations) {
        if (!class_map.containsKey(phraseTranslation.getTranslatedWord())) {
            class_map.put(phraseTranslation.getTranslatedWord(), classes.size());
            classes.add(phraseTranslation.getTranslatedWord());
        }
    }
    log.info(targetWord + " has " + classes.size() + " translations:");
    if (log.isInfoEnabled())
        for (String translation : classes)
            System.out.println(translation);
    att = new Attribute("%class", classes);
    attrs.add(att);
    attrs_map.put("%class", att);
    dataSet = new Instances("dataset", attrs, 0);
    for (PhraseTranslation phraseTranslation : translations) {
        SparseInstance inst = new SparseInstance(attrs.size());
        // set everything to 0
        for (int i = 0; i < attrs.size(); i++)
            inst.setValue(i, 0);
        // set present words to 1
        for (String word : phraseTranslation.getPhraseWords())
            inst.setValue(attrs_map.get(word), 1);
        // set class of instance
        inst.setValue(attrs_map.get("%class"), class_map.get(phraseTranslation.getTranslatedWord()));
        dataSet.add(inst);
    }
    return true;
}
From source file:eyetracker.ServerCommunicator.java
public Instance getInput() {
    // Initialize all the attributes of a new instance from the incoming data.
    int totalAttribute = MLPProcessor.inst.firstInstance().numAttributes();
    Instance instance = new SparseInstance(totalAttribute);
    instance.setDataset(MLPProcessor.inst);
    String[] attributes = unifiedData.split(",");
    //String[] attributes = examData.split(",");
    for (int i = 0; i < totalAttribute - 1; i++) {
        instance.setValue(i, Double.valueOf(attributes[i]));
    }
    return instance;
}
From source file:jkamal.ddbmssim.incmine.core.Segment.java
License:Open Source License
/**
 * Adds a new itemset to the segment
 * @param itemset itemset to be added
 */
public void addItemset(Instance instance) {
    context.addItemset(toItemset(new SparseInstance(instance)));
}
From source file:moa.streams.generators.multilabel.MetaMultilabelGenerator.java
License:Open Source License
/**
 * GenerateMLInstance.
 *
 * @param Y a set of label [indices]
 * @return a multi-labelled example
 */
private Instance generateMLInstance(HashSet<Integer> Y) {
    // create a multi-label instance:
    Instance x_ml = new SparseInstance(this.multilabelStreamTemplate.numAttributes());
    x_ml.setDataset(this.multilabelStreamTemplate);
    // set classes
    for (int j = 0; j < m_L; j++) {
        x_ml.setValue(j, 0.0);
    }
    for (int l : Y) {
        x_ml.setValue(l, 1.0);
    }
    // generate binary instances
    Instance x_0 = getNextWithBinary(0);
    Instance x_1 = getNextWithBinary(1);
    // Loop through each feature attribute @warning: assumes class is last index
    for (int a = 0; a < m_A; a++) {
        // The combination is present: use a positive value
        if (Y.containsAll(m_TopCombinations[a])) {
            x_ml.setValue(m_L + a, x_1.value(a));
            //x_ml.setValue(m_L+a,1.0);
        }
        // The combination is absent: use a negative value
        else {
            x_ml.setValue(m_L + a, x_0.value(a));
            //x_ml.setValue(m_L+a,0.0);
        }
    }
    return x_ml;
}
From source file:mulan.classifier.transformation.CalibratedLabelRanking.java
License:Open Source License
@Override
protected void buildInternal(MultiLabelInstances trainingSet) throws Exception {
    // Virtual label models
    debug("Building calibration label models");
    System.out.println("Building calibration label models");
    virtualLabelModels = new BinaryRelevance(getBaseClassifier());
    virtualLabelModels.setDebug(getDebug());
    virtualLabelModels.build(trainingSet);

    // One-vs-one models
    numModels = ((numLabels) * (numLabels - 1)) / 2;
    oneVsOneModels = AbstractClassifier.makeCopies(getBaseClassifier(), numModels);
    nodata = new boolean[numModels];
    metaDataTest = new Instances[numModels];

    Instances trainingData = trainingSet.getDataSet();

    int counter = 0;
    // Creation of one-vs-one models
    for (int label1 = 0; label1 < numLabels - 1; label1++) {
        // Attribute of label 1
        Attribute attrLabel1 = trainingData.attribute(labelIndices[label1]);
        for (int label2 = label1 + 1; label2 < numLabels; label2++) {
            debug("Building one-vs-one model " + (counter + 1) + "/" + numModels);
            System.out.println("Building one-vs-one model " + (counter + 1) + "/" + numModels);
            // Attribute of label 2
            Attribute attrLabel2 = trainingData.attribute(labelIndices[label2]);
            // initialize training set
            Instances dataOneVsOne = new Instances(trainingData, 0);
            // filter out examples with no preference
            for (int i = 0; i < trainingData.numInstances(); i++) {
                Instance tempInstance;
                if (trainingData.instance(i) instanceof SparseInstance) {
                    tempInstance = new SparseInstance(trainingData.instance(i));
                } else {
                    tempInstance = new DenseInstance(trainingData.instance(i));
                }

                int nominalValueIndex;
                nominalValueIndex = (int) tempInstance.value(labelIndices[label1]);
                String value1 = attrLabel1.value(nominalValueIndex);
                nominalValueIndex = (int) tempInstance.value(labelIndices[label2]);
                String value2 = attrLabel2.value(nominalValueIndex);

                if (!value1.equals(value2)) {
                    tempInstance.setValue(attrLabel1, value1);
                    dataOneVsOne.add(tempInstance);
                }
            }

            // remove all labels apart from label1 and place it at the end
            Reorder filter = new Reorder();
            int numPredictors = trainingData.numAttributes() - numLabels;
            int[] reorderedIndices = new int[numPredictors + 1];
            for (int i = 0; i < numPredictors; i++) {
                reorderedIndices[i] = featureIndices[i];
            }
            reorderedIndices[numPredictors] = labelIndices[label1];
            filter.setAttributeIndicesArray(reorderedIndices);
            filter.setInputFormat(dataOneVsOne);
            dataOneVsOne = Filter.useFilter(dataOneVsOne, filter);
            //System.out.println(dataOneVsOne.toString());
            dataOneVsOne.setClassIndex(numPredictors);

            // build model label1 vs label2
            if (dataOneVsOne.size() > 0) {
                oneVsOneModels[counter].buildClassifier(dataOneVsOne);
            } else {
                nodata[counter] = true;
            }
            dataOneVsOne.delete();
            metaDataTest[counter] = dataOneVsOne;
            counter++;
        }
    }
}
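The example above copies each training instance with the constructor matching its concrete type, so sparse data stays sparse after filtering. A condensed sketch of that pattern, assuming Weka 3.7 or later (where DenseInstance exists) and a variable named original for the instance being copied:

// Sketch only: copy an instance while preserving its sparse/dense representation.
Instance copy = (original instanceof SparseInstance)
        ? new SparseInstance(original)
        : new DenseInstance(original);
copy.setDataset(original.dataset());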