List of usage examples for weka.core.SparseInstance.setValue
@Override public void setValue(int attIndex, double value)
From source file:edu.cmu.lti.oaqa.baseqa.providers.ml.classifiers.MekaProvider.java
License:Apache License
@Override public void train(List<Map<String, Double>> X, List<String> Y, boolean crossValidation) throws AnalysisEngineProcessException { // create attribute (including label) info ArrayList<Attribute> attributes = new ArrayList<>(); List<String> labelNames = ClassifierProvider.labelNames(Y); labelNames.stream().map(attr -> new Attribute(attr, Arrays.asList("y", "n"))) .forEachOrdered(attributes::add); List<String> featureNames = ClassifierProvider.featureNames(X); featureNames.stream().map(Attribute::new).forEachOrdered(attributes::add); String name = Files.getNameWithoutExtension(modelFile.getName()); datasetSchema = new Instances(name, attributes, 0); datasetSchema.setClassIndex(labelNames.size()); // add instances // due to the limitation of the interface definition, X, Y should be reorganized SetMultimap<Map<String, Double>, String> XY = HashMultimap.create(); IntStream.range(0, X.size()).forEach(i -> XY.put(X.get(i), Y.get(i))); Instances trainingInstances = new Instances(datasetSchema, XY.size()); for (Map.Entry<Map<String, Double>, Collection<String>> entry : XY.asMap().entrySet()) { Set<String> y = ImmutableSet.copyOf(entry.getValue()); Map<String, Double> x = entry.getKey(); SparseInstance instance = new SparseInstance(labelNames.size() + x.size()); for (String labelName : labelNames) { instance.setValue(datasetSchema.attribute(labelName), y.contains(labelName) ? "y" : "n"); }//from w ww . j a va 2s . 
c om for (Map.Entry<String, Double> e : x.entrySet()) { instance.setValue(datasetSchema.attribute(e.getKey()), e.getValue()); } trainingInstances.add(instance); } // training try { classifier = (MultiLabelClassifier) AbstractClassifier.forName(classifierName, options); classifier.buildClassifier(trainingInstances); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } try { SerializationHelper.write(modelFile.getAbsolutePath(), classifier); SerializationHelper.write(datasetSchemaFile.getAbsolutePath(), datasetSchema); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } if (crossValidation) { try { Evaluation eval = new Evaluation(trainingInstances); Random rand = new Random(); eval.crossValidateModel(classifier, trainingInstances, 10, rand); LOG.debug(eval.toSummaryString()); } catch (Exception e) { throw new AnalysisEngineProcessException(e); } } }
From source file:europarl.PhraseTranslation.java
License:Open Source License
public boolean getFromGz(String fileName, String targetWord, int limit) { String strLine;//from w ww. j a v a 2 s . c o m ArrayList<String> line_triple = new ArrayList<String>(); BufferedReader gzipReader; Pattern word_align = Pattern.compile("(\\w+) \\(\\{(.*?)\\}\\) "); Bag<String> words_list = new Bag<String>(); //Set of ALL words: it will be the list of attributes ArrayList<PhraseTranslation> translations = new ArrayList<PhraseTranslation>(); try { gzipReader = new BufferedReader( new InputStreamReader(new GZIPInputStream(new FileInputStream(fileName)))); while ((strLine = gzipReader.readLine()) != null) //read-everything { line_triple.add(strLine); if (line_triple.size() == 3) //triple finished { //TODO: match only complete words //TODO: stem it before doing this Matcher matcher = word_align.matcher(line_triple.get(2)); String[] foreign_words = line_triple.get(1).split(" "); line_triple.clear(); if (!strLine.contains(targetWord)) //skip it continue; ArrayList<String> e_phrase = new ArrayList<String>(); String translation = ""; while (matcher.find()) //each iteration is word +alignment { assert matcher.groupCount() == 2; String e_word = matcher.group(1).trim(); if (e_word.equals("NULL")) e_word = ""; if (stopwordsList.contains(e_word)) continue; if (stemmer != null) e_word = stemmer.stem(e_word); e_phrase.add(e_word); words_list.add(e_word); //we don't care about the alignment of non-target words if (!e_word.equals(targetWord)) continue; //parse the { x y z } alignment part ArrayList<Integer> f_words = new ArrayList<Integer>(); translation = ""; //for each number between curly brackets for (String number : matcher.group(2).split(" ")) { if (!number.isEmpty()) { int n_word = Integer.parseInt(number) - 1; f_words.add(n_word); translation += foreign_words[n_word] + " "; } } // end of curly brackets for } //end of word+alignment while if (!translation.isEmpty()) { PhraseTranslation trans = new PhraseTranslation(e_phrase, translation); translations.add(trans); } 
line_triple.clear(); } //end of triple-finished if if (translations.size() == limit) break; //stop collecting! } //end of the read-everything while } catch (Exception e) { log.error("Error: " + e); e.printStackTrace(); return false; } //what we NOW have: a set of attributes in HashSet<String>words_list //a ArrayList<PhraseTranslation> translations log.info("Collected " + translations.size() + " phrases and " + words_list.size() + " words"); postProcessData(translations, words_list); //now convert the data we collected to Weka data //we needed to do "double passing" because we need to initialize //the dataset with the complete list of attributes //this will convert word to attributes: they are all "boolean" ArrayList<Attribute> attrs = new ArrayList<Attribute>(); HashMap<String, Attribute> attrs_map = new HashMap<String, Attribute>(); Attribute att; for (String word : words_list) { att = new Attribute(word); attrs.add(att); attrs_map.put(word, att); } //now we need to manage class. //each translation is a class, so we need to get all of them HashMap<String, Integer> class_map = new HashMap<String, Integer>(); ArrayList<String> classes = new ArrayList<String>(); for (PhraseTranslation phraseTranslation : translations) { if (!class_map.containsKey(phraseTranslation.getTranslatedWord())) { class_map.put(phraseTranslation.getTranslatedWord(), classes.size()); classes.add(phraseTranslation.getTranslatedWord()); } } log.info(targetWord + " has " + classes.size() + " translations:"); if (log.isInfoEnabled()) for (String translation : classes) System.out.println(translation); att = new Attribute("%class", classes); attrs.add(att); attrs_map.put("%class", att); dataSet = new Instances("dataset", attrs, 0); for (PhraseTranslation phraseTranslation : translations) { SparseInstance inst = new SparseInstance(attrs.size()); //set everything to 0 for (int i = 0; i < attrs.size(); i++) inst.setValue(i, 0); //set present word to 1 for (String word : 
phraseTranslation.getPhraseWords()) inst.setValue(attrs_map.get(word), 1); //set class of instance inst.setValue(attrs_map.get("%class"), class_map.get(phraseTranslation.getTranslatedWord())); dataSet.add(inst); } return true; }
From source file:mulan.classifier.transformation.TwoStageClassifierChainArchitecture.java
License:Open Source License
/**
 * Returns a sparse copy of {@code instance} with one new leading attribute per confidence.
 *
 * <p>The confidences are inserted at index 0 from last to first, so {@code confidences[0]} is
 * the value that ends up at index 0 of the result.
 *
 * @param instance the instance to copy; it is not modified
 * @param confidences values for the new leading attributes
 * @return the extended copy
 */
private Instance modifySparseInstance(Instance instance, double[] confidences) {
    SparseInstance extended = new SparseInstance(instance);
    int remaining = confidences.length;
    while (remaining > 0) {
        remaining--;
        extended.insertAttributeAt(0);
        extended.setValue(0, confidences[remaining]);
    }
    return extended;
}
From source file:mulan.classifier.transformation.TwoStagePrunedClassifierChainArchitecture.java
License:Open Source License
private Instance modifySparseInstance(Instance ins, double value1, double value2) { SparseInstance modifiedIns = new SparseInstance(ins); modifiedIns.insertAttributeAt(0);//from w w w . j av a 2 s. c o m modifiedIns.setValue(0, value1); modifiedIns.insertAttributeAt(0); modifiedIns.setValue(0, value2); return modifiedIns; }
From source file:predictors.HelixIndexer.java
License:Open Source License
/**
 * Builds the Weka instance describing the sequence window centred on {@code windowCenter}.
 *
 * <p>Attributes are filled strictly in the order of {@code this.attributes}: 21 values per
 * window position (20 PSSM scores followed by a marker value), ten statistics gathered over
 * the inner window, three binned values (N-terminal distance, C-terminal distance, sequence
 * length) and finally the interleaved {@code globalConsAa} / {@code globalNonConsAa} entries.
 *
 * @param pssm profile supplying the sequence length and the per-position scores
 * @param windowCenter sequence position at the centre of the window
 * @return the populated instance
 */
private Instance buildInstance(Pssm pssm, int windowCenter) {
    SparseInstance window = new SparseInstance(this.attributes.size());

    int firstPos = windowCenter - Globals.INDEXER_WINDOW_SIZE;
    int lastPos = windowCenter + Globals.INDEXER_WINDOW_SIZE;
    int seqLength = pssm.getLength();

    int next = 0; // index of the next attribute to fill

    int consCount = 0;
    int nonConsCount = 0;
    double consHydroSum = 0;
    double nonConsHydroSum = 0;
    double consHydroHits = 0;
    double nonConsHydroHits = 0;
    double consPosHits = 0;
    double nonConsPosHits = 0;
    double consNegHits = 0;
    double nonConsNegHits = 0;
    double consPolarHits = 0;
    double nonConsPolarHits = 0;

    for (int pos = firstPos; pos <= lastPos; ++pos) {
        if (pos < 0 || pos >= seqLength) {
            // position lies outside the sequence: 20 zero scores, then the marker 10
            for (int k = 0; k < 20; ++k) {
                window.setValue((Attribute) this.attributes.get(next++), 0);
            }
            window.setValue((Attribute) this.attributes.get(next++), 10);
            continue;
        }

        // only positions close to the centre contribute to the statistics
        boolean inInnerWindow =
                Math.abs(pos - windowCenter) <= Globals.INDEXER_INNER_WINDOW_SIZE;

        for (int aaIdx = 0; aaIdx < 20; ++aaIdx) {
            int score = pssm.getScore(pos, aaIdx);

            if (inInnerWindow) {
                char aa = Mappings.intToAa(aaIdx);

                if (score > 0) {
                    consHydroSum += Mappings.hydrophobicity(aa);
                    if (Mappings.hydrophobicity(aa) > 0) {
                        consHydroHits += 1;
                    }
                    if (Mappings.charge(aa) > 0) {
                        consPosHits += 1;
                    }
                    if (Mappings.charge(aa) < 0) {
                        consNegHits += 1;
                    }
                    if (Mappings.polarity(aa) > 0) {
                        consPolarHits += 1;
                    }
                    consCount += 1;
                } else if (score < 0) {
                    nonConsHydroSum += Mappings.hydrophobicity(aa);
                    if (Mappings.hydrophobicity(aa) > 0) {
                        nonConsHydroHits += 1;
                    }
                    if (Mappings.charge(aa) > 0) {
                        nonConsPosHits += 1;
                    }
                    if (Mappings.charge(aa) < 0) {
                        nonConsNegHits += 1;
                    }
                    if (Mappings.polarity(aa) > 0) {
                        nonConsPolarHits += 1;
                    }
                    nonConsCount += 1;
                }
            }

            window.setValue((Attribute) this.attributes.get(next++), score);
        }
        // position lies inside the sequence: marker -10
        window.setValue((Attribute) this.attributes.get(next++), -10);
    }

    // avoid dividing by zero when nothing was counted
    consCount = Math.max(consCount, 1);
    nonConsCount = Math.max(nonConsCount, 1);

    double[] innerStats = {
        consHydroSum / consCount,
        nonConsHydroSum / nonConsCount,
        consHydroHits / consCount,
        nonConsHydroHits / nonConsCount,
        consPosHits / consCount,
        nonConsPosHits / nonConsCount,
        consNegHits / consCount,
        nonConsNegHits / nonConsCount,
        consPolarHits / consCount,
        nonConsPolarHits / nonConsCount
    };
    for (double stat : innerStats) {
        window.setValue((Attribute) this.attributes.get(next++), stat);
    }

    // distances to both termini are binned in steps of 10, the sequence length in steps of 60
    int nTermDist = windowCenter + 1;
    int cTermDist = seqLength - windowCenter;
    int nTermBin = nTermDist > 40 ? 4 : nTermDist > 30 ? 3 : nTermDist > 20 ? 2
            : nTermDist > 10 ? 1 : 0;
    int cTermBin = cTermDist > 40 ? 4 : cTermDist > 30 ? 3 : cTermDist > 20 ? 2
            : cTermDist > 10 ? 1 : 0;
    int lengthBin = seqLength > 240 ? 4 : seqLength > 180 ? 3 : seqLength > 120 ? 2
            : seqLength > 60 ? 1 : 0;

    window.setValue((Attribute) this.attributes.get(next++), nTermBin);
    window.setValue((Attribute) this.attributes.get(next++), cTermBin);
    window.setValue((Attribute) this.attributes.get(next++), lengthBin);

    for (int aaIdx = 0; aaIdx < 20; ++aaIdx) {
        window.setValue((Attribute) this.attributes.get(next++), this.globalConsAa[aaIdx]);
        window.setValue((Attribute) this.attributes.get(next++), this.globalNonConsAa[aaIdx]);
    }

    return window;
}
From source file:predictors.HelixPredictor.java
License:Open Source License
/** * Converts a given segment (TMH or not) into a Weka Instance. * /*from w ww . j av a2s . c om*/ * @param pssm * @param start * @param end * @return */ private Instance buildInstance(Pssm pssm, int start, int end) { SparseInstance window = new SparseInstance(this.attributes.size()); int length = end - start + 1; int attIndex = 0; int conserved = 0; int nonConserved = 0; double consAvgHydro = 0; double nonConsAvgHydro = 0; double consHydro = 0; double nonConsHydro = 0; double consCharged = 0; double nonConsCharged = 0; double[] consAaComp = new double[20]; double[] nonConsAaComp = new double[20]; //amino acid composition, hydrophobicity, and charge for (int i = start; i <= end; ++i) { for (int j = 0; j < 20; ++j) { int score = pssm.getScore(i, j); char aa = Mappings.intToAa(j); if (score > 0) { consAvgHydro += Mappings.hydrophobicity(aa); if (Mappings.hydrophobicity(aa) > 0) { ++consHydro; } if (Mappings.charge(aa) != 0) { ++consCharged; } ++consAaComp[j]; ++conserved; } else if (score < 0) { nonConsAvgHydro += Mappings.hydrophobicity(aa); if (Mappings.hydrophobicity(aa) > 0) { ++nonConsHydro; } if (Mappings.charge(aa) != 0) { ++nonConsCharged; } ++nonConsAaComp[j]; ++nonConserved; } } } conserved = Math.max(conserved, 1); nonConserved = Math.max(nonConserved, 1); for (int i = 0; i < consAaComp.length; ++i) { consAaComp[i] = consAaComp[i] / conserved; window.setValue((Attribute) this.attributes.get(attIndex++), consAaComp[i]); } for (int i = 0; i < nonConsAaComp.length; ++i) { nonConsAaComp[i] = nonConsAaComp[i] / nonConserved; window.setValue((Attribute) this.attributes.get(attIndex++), nonConsAaComp[i]); } window.setValue((Attribute) this.attributes.get(attIndex++), length); window.setValue((Attribute) this.attributes.get(attIndex++), consAvgHydro / conserved); window.setValue((Attribute) this.attributes.get(attIndex++), nonConsAvgHydro / nonConserved); window.setValue((Attribute) this.attributes.get(attIndex++), consHydro / conserved); 
window.setValue((Attribute) this.attributes.get(attIndex++), nonConsHydro / nonConserved); window.setValue((Attribute) this.attributes.get(attIndex++), consCharged / conserved); window.setValue((Attribute) this.attributes.get(attIndex++), nonConsCharged / nonConserved); return window; }
From source file:predictors.TopologyPredictor.java
License:Open Source License
/** * Converts a list of segments for both sides of the membrane into a WEKA instance. * // w w w. ja v a2s.c om * @param pssm * @param structure * @param segments * @param startPos * @return */ private Instance buildInstance(Pssm pssm, char[] structure, ArrayList<Segment> segments, int startPos) { SparseInstance protein = new SparseInstance(this.attributes.size()); double[] consComposition1 = new double[20]; double[] nonConsComposition1 = new double[20]; double[] consComposition2 = new double[20]; double[] nonConsComposition2 = new double[20]; int attIndex = 0; int conserved1 = 0; int nonConserved1 = 0; int conserved2 = 0; int nonConserved2 = 0; int consPosCharged1 = 0; int nonConsPosCharged1 = 0; int conPosCharged2 = 0; int nonConsPosCharged2 = 0; int firstSide = -1; for (Segment segment : segments) { if (segment.end < startPos) { continue; } if (firstSide == -1) { firstSide = segment.type; } int start = Math.max(segment.start, startPos); int end = segment.end; int side = segment.type; for (int i = start; i <= end; ++i) { if (Mappings.ssToInt(structure[i]) != Mappings.indexUnknown) { for (int j = 0; j < 20; ++j) { int score = pssm.getScore(i, j); char aa = Mappings.intToAa(j); if (score > 0) { if (side == firstSide) { ++consComposition1[j]; ++conserved1; if (Mappings.charge(aa) > 0) { ++consPosCharged1; } } else { ++consComposition2[j]; ++conserved2; if (Mappings.charge(aa) > 0) { ++conPosCharged2; } } } else if (score < 0) { if (side == firstSide) { ++nonConsComposition1[j]; ++nonConserved1; if (Mappings.charge(aa) > 0) { ++nonConsPosCharged1; } } else { ++nonConsComposition2[j]; ++nonConserved2; if (Mappings.charge(aa) > 0) { ++nonConsPosCharged2; } } } } } } } conserved1 = Math.max(conserved1, 1); nonConserved1 = Math.max(nonConserved1, 1); conserved2 = Math.max(conserved2, 1); nonConserved2 = Math.max(nonConserved2, 1); //normalize for length for (int i = 0; i < consComposition1.length; ++i) { consComposition1[i] = consComposition1[i] / conserved1; 
protein.setValue((Attribute) this.attributes.get(attIndex++), consComposition1[i]); } for (int i = 0; i < nonConsComposition1.length; ++i) { nonConsComposition1[i] = nonConsComposition1[i] / nonConserved1; protein.setValue((Attribute) this.attributes.get(attIndex++), nonConsComposition1[i]); } for (int i = 0; i < consComposition2.length; ++i) { consComposition2[i] = consComposition2[i] / conserved2; protein.setValue((Attribute) this.attributes.get(attIndex++), consComposition2[i]); } for (int i = 0; i < nonConsComposition2.length; ++i) { nonConsComposition2[i] = nonConsComposition2[i] / nonConserved2; protein.setValue((Attribute) this.attributes.get(attIndex++), nonConsComposition2[i]); } protein.setValue((Attribute) this.attributes.get(attIndex++), (double) consPosCharged1 / (double) conserved1); protein.setValue((Attribute) this.attributes.get(attIndex++), (double) nonConsPosCharged1 / (double) nonConserved1); protein.setValue((Attribute) this.attributes.get(attIndex++), (double) conPosCharged2 / (double) conserved2); protein.setValue((Attribute) this.attributes.get(attIndex++), (double) nonConsPosCharged2 / (double) nonConserved2); protein.setValue((Attribute) this.attributes.get(attIndex++), (consPosCharged1 - conPosCharged2)); protein.setValue((Attribute) this.attributes.get(attIndex++), (nonConsPosCharged1 - nonConsPosCharged2)); return protein; }