Example usage for weka.core SparseInstance setValue

List of usage examples for weka.core SparseInstance setValue

Introduction

On this page you can find example usages of weka.core.SparseInstance.setValue.

Prototype

@Override
public void setValue(int attIndex, double value) 

Source Link

Document

Sets a specific value in the instance to the given value (internal floating-point format).

Usage

From source file:edu.cmu.lti.oaqa.baseqa.providers.ml.classifiers.MekaProvider.java

License:Apache License

/**
 * Trains the multi-label classifier on the given feature maps and labels, then persists both
 * the trained model and the dataset schema.
 *
 * <p>The schema holds one nominal attribute (values {@code "y"}/{@code "n"}) per label name,
 * followed by one attribute per feature name; the class index is set to the number of labels.
 * Entries of {@code X} that are equal as maps are merged into a single training instance that
 * carries all of their labels.
 *
 * @param X feature name to value maps, one per example
 * @param Y label of the example at the same position in {@code X}
 * @param crossValidation if {@code true}, additionally runs a 10-fold cross validation and
 *        logs its summary at debug level
 * @throws AnalysisEngineProcessException if building, serializing or evaluating the classifier
 *         fails
 */
@Override
public void train(List<Map<String, Double>> X, List<String> Y, boolean crossValidation)
        throws AnalysisEngineProcessException {
    // schema: label attributes first, feature attributes afterwards
    ArrayList<Attribute> schemaAttributes = new ArrayList<>();
    List<String> labelNames = ClassifierProvider.labelNames(Y);
    for (String label : labelNames) {
        schemaAttributes.add(new Attribute(label, Arrays.asList("y", "n")));
    }
    List<String> featureNames = ClassifierProvider.featureNames(X);
    for (String feature : featureNames) {
        schemaAttributes.add(new Attribute(feature));
    }
    String relationName = Files.getNameWithoutExtension(modelFile.getName());
    datasetSchema = new Instances(relationName, schemaAttributes, 0);
    datasetSchema.setClassIndex(labelNames.size());

    // the interface delivers one (features, label) pair per position, so group the labels
    // that belong to the same feature map
    SetMultimap<Map<String, Double>, String> labelsByFeatures = HashMultimap.create();
    for (int i = 0; i < X.size(); i++) {
        labelsByFeatures.put(X.get(i), Y.get(i));
    }

    Instances trainingInstances = new Instances(datasetSchema, labelsByFeatures.size());
    for (Map.Entry<Map<String, Double>, Collection<String>> grouped : labelsByFeatures.asMap()
            .entrySet()) {
        Map<String, Double> features = grouped.getKey();
        Set<String> assignedLabels = ImmutableSet.copyOf(grouped.getValue());
        SparseInstance instance = new SparseInstance(labelNames.size() + features.size());
        for (String labelName : labelNames) {
            String flag;
            if (assignedLabels.contains(labelName)) {
                flag = "y";
            } else {
                flag = "n";
            }
            instance.setValue(datasetSchema.attribute(labelName), flag);
        }
        for (Map.Entry<String, Double> feature : features.entrySet()) {
            instance.setValue(datasetSchema.attribute(feature.getKey()), feature.getValue());
        }
        trainingInstances.add(instance);
    }

    // train, then persist the model together with the schema it was trained on
    try {
        classifier = (MultiLabelClassifier) AbstractClassifier.forName(classifierName, options);
        classifier.buildClassifier(trainingInstances);
        SerializationHelper.write(modelFile.getAbsolutePath(), classifier);
        SerializationHelper.write(datasetSchemaFile.getAbsolutePath(), datasetSchema);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    if (!crossValidation) {
        return;
    }
    try {
        Evaluation evaluation = new Evaluation(trainingInstances);
        evaluation.crossValidateModel(classifier, trainingInstances, 10, new Random());
        LOG.debug(evaluation.toSummaryString());
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }
}

From source file:europarl.PhraseTranslation.java

License:Open Source License

/**
 * Reads alignment triples from a gzip-compressed text file and builds the Weka data set stored
 * in {@code dataSet}.
 *
 * <p>The file is consumed three lines at a time. The second line of a triple is split on spaces
 * into the foreign words; the third line is scanned for {@code word ({ n n ... })} groups whose
 * numbers are 1-based positions into those foreign words. Triples whose third line does not
 * contain {@code targetWord} are skipped. For every kept triple the (optionally stemmed,
 * stopword-filtered) words become the phrase and the foreign words aligned to
 * {@code targetWord} become its translation, which later serves as the class value.
 *
 * <p>The underlying file stream is always closed before this method returns.
 *
 * @param fileName path of the gzip-compressed input file
 * @param targetWord word whose translations are collected
 * @param limit reading stops as soon as the number of collected translations equals this value
 * @return {@code false} if any exception occurred while reading or parsing the file,
 *         {@code true} once the data set has been built
 */
public boolean getFromGz(String fileName, String targetWord, int limit) {
    String strLine;
    ArrayList<String> line_triple = new ArrayList<String>();

    Pattern word_align = Pattern.compile("(\\w+) \\(\\{(.*?)\\}\\) ");

    Bag<String> words_list = new Bag<String>(); //Set of ALL words: it will be the list of attributes
    ArrayList<PhraseTranslation> translations = new ArrayList<PhraseTranslation>();
    // kept outside the try block so that the finally clause can release them
    FileInputStream fileStream = null;
    BufferedReader gzipReader = null;
    try {
        fileStream = new FileInputStream(fileName);
        gzipReader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));

        while ((strLine = gzipReader.readLine()) != null) //read-everything
        {
            line_triple.add(strLine);
            if (line_triple.size() == 3) //triple finished
            {
                //TODO: match only complete words
                //TODO: stem it before doing this

                Matcher matcher = word_align.matcher(line_triple.get(2));
                String[] foreign_words = line_triple.get(1).split(" ");
                // the triple is consumed here, whether or not it is skipped below
                line_triple.clear();
                if (!strLine.contains(targetWord)) //skip it
                    continue;

                ArrayList<String> e_phrase = new ArrayList<String>();
                String translation = "";
                while (matcher.find()) //each iteration is word +alignment
                {
                    assert matcher.groupCount() == 2;
                    String e_word = matcher.group(1).trim();
                    if (e_word.equals("NULL"))
                        e_word = "";
                    if (stopwordsList.contains(e_word))
                        continue;
                    if (stemmer != null)
                        e_word = stemmer.stem(e_word);

                    e_phrase.add(e_word);
                    words_list.add(e_word);

                    //we don't care about the alignment of non-target words
                    if (!e_word.equals(targetWord))
                        continue;

                    //parse the { x y z } alignment part
                    ArrayList<Integer> f_words = new ArrayList<Integer>();
                    translation = "";
                    //for each number between curly brackets
                    for (String number : matcher.group(2).split(" ")) {
                        if (!number.isEmpty()) {
                            int n_word = Integer.parseInt(number) - 1;
                            f_words.add(n_word);
                            translation += foreign_words[n_word] + " ";
                        }
                    } // end of curly brackets for

                } //end of word+alignment while
                if (!translation.isEmpty()) {
                    PhraseTranslation trans = new PhraseTranslation(e_phrase, translation);
                    translations.add(trans);
                }
            } //end of triple-finished if
            if (translations.size() == limit)
                break; //stop collecting!
        } //end of the read-everything while
    } catch (Exception e) {
        log.error("Error: " + e);
        e.printStackTrace();
        return false;
    } finally {
        // Closing the outermost reader closes the whole chain; fall back to the raw file
        // stream when wrapping it failed (e.g. the file is not valid gzip data).
        try {
            if (gzipReader != null) {
                gzipReader.close();
            } else if (fileStream != null) {
                fileStream.close();
            }
        } catch (Exception closeError) {
            // the data has already been read (or the failure reported), so only log this
            log.error("Error: " + closeError);
        }
    }

    //what we NOW have: a set of attributes in words_list
    //and an ArrayList<PhraseTranslation> translations
    log.info("Collected " + translations.size() + " phrases and " + words_list.size() + " words");

    postProcessData(translations, words_list);

    //now convert the data we collected to Weka data
    //we needed to do "double passing" because we need to initialize
    //the dataset with the complete list of attributes

    //this will convert word to attributes: they are all "boolean"
    ArrayList<Attribute> attrs = new ArrayList<Attribute>();
    HashMap<String, Attribute> attrs_map = new HashMap<String, Attribute>();
    Attribute att;
    for (String word : words_list) {
        att = new Attribute(word);
        attrs.add(att);
        attrs_map.put(word, att);
    }

    //now we need to manage class.
    //each translation is a class, so we need to get all of them
    HashMap<String, Integer> class_map = new HashMap<String, Integer>();
    ArrayList<String> classes = new ArrayList<String>();
    for (PhraseTranslation phraseTranslation : translations) {
        if (!class_map.containsKey(phraseTranslation.getTranslatedWord())) {
            class_map.put(phraseTranslation.getTranslatedWord(), classes.size());
            classes.add(phraseTranslation.getTranslatedWord());
        }
    }

    log.info(targetWord + " has " + classes.size() + " translations:");
    if (log.isInfoEnabled())
        for (String translation : classes)
            System.out.println(translation);
    att = new Attribute("%class", classes);
    attrs.add(att);
    attrs_map.put("%class", att);
    dataSet = new Instances("dataset", attrs, 0);
    for (PhraseTranslation phraseTranslation : translations) {
        SparseInstance inst = new SparseInstance(attrs.size());
        //set everything to 0
        for (int i = 0; i < attrs.size(); i++)
            inst.setValue(i, 0);
        //set present word to 1
        for (String word : phraseTranslation.getPhraseWords())
            inst.setValue(attrs_map.get(word), 1);
        //set class of instance
        inst.setValue(attrs_map.get("%class"), class_map.get(phraseTranslation.getTranslatedWord()));
        dataSet.add(inst);
    }

    return true;
}

From source file:mulan.classifier.transformation.TwoStageClassifierChainArchitecture.java

License:Open Source License

/**
 * Returns a sparse copy of {@code instance} with the given confidences prepended as new
 * leading attributes, so that {@code confidences[0]} ends up at attribute index 0.
 *
 * @param instance the instance to copy
 * @param confidences values for the new leading attributes
 * @return the extended copy; {@code instance} itself is not modified by this method
 */
private Instance modifySparseInstance(Instance instance, double[] confidences) {
    SparseInstance extended = new SparseInstance(instance);
    // insert back to front: every insertion at index 0 shifts the earlier ones to the right
    int remaining = confidences.length;
    while (remaining > 0) {
        remaining--;
        extended.insertAttributeAt(0);
        extended.setValue(0, confidences[remaining]);
    }
    return extended;
}

From source file:mulan.classifier.transformation.TwoStagePrunedClassifierChainArchitecture.java

License:Open Source License

/**
 * Returns a sparse copy of {@code ins} with two new leading attributes: {@code value2} at
 * index 0 followed by {@code value1} at index 1.
 *
 * @param ins the instance to copy
 * @param value1 value inserted first; ends up at attribute index 1
 * @param value2 value inserted second; ends up at attribute index 0
 * @return the extended copy
 */
private Instance modifySparseInstance(Instance ins, double value1, double value2) {
    SparseInstance extended = new SparseInstance(ins);
    // each value is pushed in at the front, so the last one inserted becomes attribute 0
    for (double value : new double[] { value1, value2 }) {
        extended.insertAttributeAt(0);
        extended.setValue(0, value);
    }
    return extended;
}

From source file:predictors.HelixIndexer.java

License:Open Source License

/**
 * Converts the window centred on a given position into a Weka instance.
 *
 * <p>Values are written to the entries of {@code this.attributes} in order: for every window
 * position 20 scores plus one marker value (-10 for positions inside the sequence, 10 with
 * all-zero scores for positions outside it), then ten averages computed over the inner window,
 * then the binned N-terminal distance, C-terminal distance and sequence length, and finally
 * the interleaved {@code globalConsAa}/{@code globalNonConsAa} values.
 *
 * @param pssm profile supplying the sequence length and the per-position scores
 * @param windowCenter position the window is centred on
 * @return the populated instance
 */
private Instance buildInstance(Pssm pssm, int windowCenter) {
    SparseInstance instance = new SparseInstance(this.attributes.size());
    final int firstPos = windowCenter - Globals.INDEXER_WINDOW_SIZE;
    final int lastPos = windowCenter + Globals.INDEXER_WINDOW_SIZE;
    final int proteinLength = pssm.getLength();
    final int nTermDist = windowCenter + 1;
    final int cTermDist = proteinLength - windowCenter;
    int next = 0; // index of the next attribute to fill

    // statistics over the inner window, split by the sign of the score
    int consCount = 0;
    int nonConsCount = 0;
    double consHydroSum = 0;
    double nonConsHydroSum = 0;
    double consHydrophobic = 0;
    double nonConsHydrophobic = 0;
    double consPositive = 0;
    double nonConsPositive = 0;
    double consNegative = 0;
    double nonConsNegative = 0;
    double consPolarCount = 0;
    double nonConsPolarCount = 0;

    for (int pos = firstPos; pos <= lastPos; ++pos) {
        if (pos < 0 || pos >= proteinLength) {
            // window sticks out of the sequence: zero scores and the "outside" marker
            for (int aaIndex = 0; aaIndex < 20; ++aaIndex) {
                instance.setValue((Attribute) this.attributes.get(next++), 0);
            }
            instance.setValue((Attribute) this.attributes.get(next++), 10);
            continue;
        }

        final boolean inInnerWindow = Math.abs(pos - windowCenter) <= Globals.INDEXER_INNER_WINDOW_SIZE;

        for (int aaIndex = 0; aaIndex < 20; ++aaIndex) {
            int score = pssm.getScore(pos, aaIndex);

            if (inInnerWindow) {
                char aa = Mappings.intToAa(aaIndex);

                if (score > 0) {
                    consHydroSum += Mappings.hydrophobicity(aa);
                    if (Mappings.hydrophobicity(aa) > 0) {
                        consHydrophobic += 1;
                    }
                    if (Mappings.charge(aa) > 0) {
                        consPositive += 1;
                    }
                    if (Mappings.charge(aa) < 0) {
                        consNegative += 1;
                    }
                    if (Mappings.polarity(aa) > 0) {
                        consPolarCount += 1;
                    }
                    consCount += 1;
                } else if (score < 0) {
                    nonConsHydroSum += Mappings.hydrophobicity(aa);
                    if (Mappings.hydrophobicity(aa) > 0) {
                        nonConsHydrophobic += 1;
                    }
                    if (Mappings.charge(aa) > 0) {
                        nonConsPositive += 1;
                    }
                    if (Mappings.charge(aa) < 0) {
                        nonConsNegative += 1;
                    }
                    if (Mappings.polarity(aa) > 0) {
                        nonConsPolarCount += 1;
                    }
                    nonConsCount += 1;
                }
            }

            instance.setValue((Attribute) this.attributes.get(next++), score);
        }

        // marker for a position that lies inside the sequence
        instance.setValue((Attribute) this.attributes.get(next++), -10);
    }

    // avoid dividing by zero when nothing was counted
    consCount = Math.max(consCount, 1);
    nonConsCount = Math.max(nonConsCount, 1);

    instance.setValue((Attribute) this.attributes.get(next++), consHydroSum / consCount);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsHydroSum / nonConsCount);

    instance.setValue((Attribute) this.attributes.get(next++), consHydrophobic / consCount);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsHydrophobic / nonConsCount);

    instance.setValue((Attribute) this.attributes.get(next++), consPositive / consCount);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsPositive / nonConsCount);

    instance.setValue((Attribute) this.attributes.get(next++), consNegative / consCount);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsNegative / nonConsCount);

    instance.setValue((Attribute) this.attributes.get(next++), consPolarCount / consCount);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsPolarCount / nonConsCount);

    // terminal distances are binned in steps of 10, the sequence length in steps of 60
    final int nTermBin = nTermDist > 40 ? 4 : nTermDist > 30 ? 3 : nTermDist > 20 ? 2 : nTermDist > 10 ? 1 : 0;
    final int cTermBin = cTermDist > 40 ? 4 : cTermDist > 30 ? 3 : cTermDist > 20 ? 2 : cTermDist > 10 ? 1 : 0;
    final int lengthBin = proteinLength > 240 ? 4
            : proteinLength > 180 ? 3 : proteinLength > 120 ? 2 : proteinLength > 60 ? 1 : 0;

    instance.setValue((Attribute) this.attributes.get(next++), nTermBin);
    instance.setValue((Attribute) this.attributes.get(next++), cTermBin);
    instance.setValue((Attribute) this.attributes.get(next++), lengthBin);

    for (int aaIndex = 0; aaIndex < 20; ++aaIndex) {
        instance.setValue((Attribute) this.attributes.get(next++), this.globalConsAa[aaIndex]);
        instance.setValue((Attribute) this.attributes.get(next++), this.globalNonConsAa[aaIndex]);
    }

    return instance;
}

From source file:predictors.HelixPredictor.java

License:Open Source License

/**
 * Converts a given segment (TMH or not) into a Weka Instance.
 * /*from   w ww  .  j av  a2s  . c om*/
 * @param pssm
 * @param start
 * @param end
 * @return
 */
private Instance buildInstance(Pssm pssm, int start, int end) {
    SparseInstance window = new SparseInstance(this.attributes.size());
    int length = end - start + 1;
    int attIndex = 0;
    int conserved = 0;
    int nonConserved = 0;
    double consAvgHydro = 0;
    double nonConsAvgHydro = 0;
    double consHydro = 0;
    double nonConsHydro = 0;
    double consCharged = 0;
    double nonConsCharged = 0;
    double[] consAaComp = new double[20];
    double[] nonConsAaComp = new double[20];

    //amino acid composition, hydrophobicity, and charge
    for (int i = start; i <= end; ++i) {
        for (int j = 0; j < 20; ++j) {
            int score = pssm.getScore(i, j);
            char aa = Mappings.intToAa(j);

            if (score > 0) {
                consAvgHydro += Mappings.hydrophobicity(aa);

                if (Mappings.hydrophobicity(aa) > 0) {
                    ++consHydro;
                }
                if (Mappings.charge(aa) != 0) {
                    ++consCharged;
                }

                ++consAaComp[j];
                ++conserved;
            } else if (score < 0) {
                nonConsAvgHydro += Mappings.hydrophobicity(aa);

                if (Mappings.hydrophobicity(aa) > 0) {
                    ++nonConsHydro;
                }
                if (Mappings.charge(aa) != 0) {
                    ++nonConsCharged;
                }

                ++nonConsAaComp[j];
                ++nonConserved;
            }
        }
    }

    conserved = Math.max(conserved, 1);
    nonConserved = Math.max(nonConserved, 1);

    for (int i = 0; i < consAaComp.length; ++i) {
        consAaComp[i] = consAaComp[i] / conserved;

        window.setValue((Attribute) this.attributes.get(attIndex++), consAaComp[i]);
    }

    for (int i = 0; i < nonConsAaComp.length; ++i) {
        nonConsAaComp[i] = nonConsAaComp[i] / nonConserved;

        window.setValue((Attribute) this.attributes.get(attIndex++), nonConsAaComp[i]);
    }

    window.setValue((Attribute) this.attributes.get(attIndex++), length);

    window.setValue((Attribute) this.attributes.get(attIndex++), consAvgHydro / conserved);
    window.setValue((Attribute) this.attributes.get(attIndex++), nonConsAvgHydro / nonConserved);

    window.setValue((Attribute) this.attributes.get(attIndex++), consHydro / conserved);
    window.setValue((Attribute) this.attributes.get(attIndex++), nonConsHydro / nonConserved);

    window.setValue((Attribute) this.attributes.get(attIndex++), consCharged / conserved);
    window.setValue((Attribute) this.attributes.get(attIndex++), nonConsCharged / nonConserved);

    return window;
}

From source file:predictors.TopologyPredictor.java

License:Open Source License

/**
 * Converts a list of segments into a WEKA instance that compares the two segment types.
 *
 * <p>Segments ending before {@code startPos} are ignored. The type of the first remaining
 * segment defines the "first side"; every segment of a different type is counted for the
 * second side. Values are written to the entries of {@code this.attributes} in order: four
 * blocks of 20 composition fractions (first side positive/negative scores, second side
 * positive/negative scores), four positively-charged fractions in the same order, and finally
 * the two differences of the positively-charged counts (first side minus second side).
 *
 * @param pssm profile supplying the per-position scores
 * @param structure per-position structure symbols; positions that map to
 *        {@code Mappings.indexUnknown} are skipped
 * @param segments the segments to evaluate
 * @param startPos first position that is taken into account
 * @return the populated instance
 */
private Instance buildInstance(Pssm pssm, char[] structure, ArrayList<Segment> segments, int startPos) {
    SparseInstance instance = new SparseInstance(this.attributes.size());
    double[] consCompFirst = new double[20];
    double[] nonConsCompFirst = new double[20];
    double[] consCompSecond = new double[20];
    double[] nonConsCompSecond = new double[20];
    int next = 0; // index of the next attribute to fill
    int consTotalFirst = 0;
    int nonConsTotalFirst = 0;
    int consTotalSecond = 0;
    int nonConsTotalSecond = 0;
    int consPosFirst = 0;
    int nonConsPosFirst = 0;
    int consPosSecond = 0;
    int nonConsPosSecond = 0;
    int firstSide = -1;

    for (Segment segment : segments) {
        if (segment.end < startPos) {
            continue;
        }
        if (firstSide == -1) {
            firstSide = segment.type;
        }

        final int from = Math.max(segment.start, startPos);
        final int to = segment.end;
        final int side = segment.type;
        final boolean onFirstSide = side == firstSide;

        for (int pos = from; pos <= to; ++pos) {
            if (Mappings.ssToInt(structure[pos]) == Mappings.indexUnknown) {
                continue;
            }

            for (int aaIndex = 0; aaIndex < 20; ++aaIndex) {
                int score = pssm.getScore(pos, aaIndex);
                char aa = Mappings.intToAa(aaIndex);

                if (onFirstSide) {
                    if (score > 0) {
                        consCompFirst[aaIndex] += 1;
                        consTotalFirst += 1;
                        if (Mappings.charge(aa) > 0) {
                            consPosFirst += 1;
                        }
                    } else if (score < 0) {
                        nonConsCompFirst[aaIndex] += 1;
                        nonConsTotalFirst += 1;
                        if (Mappings.charge(aa) > 0) {
                            nonConsPosFirst += 1;
                        }
                    }
                } else {
                    if (score > 0) {
                        consCompSecond[aaIndex] += 1;
                        consTotalSecond += 1;
                        if (Mappings.charge(aa) > 0) {
                            consPosSecond += 1;
                        }
                    } else if (score < 0) {
                        nonConsCompSecond[aaIndex] += 1;
                        nonConsTotalSecond += 1;
                        if (Mappings.charge(aa) > 0) {
                            nonConsPosSecond += 1;
                        }
                    }
                }
            }
        }
    }

    // avoid dividing by zero when nothing was counted
    consTotalFirst = Math.max(consTotalFirst, 1);
    nonConsTotalFirst = Math.max(nonConsTotalFirst, 1);
    consTotalSecond = Math.max(consTotalSecond, 1);
    nonConsTotalSecond = Math.max(nonConsTotalSecond, 1);

    // compositions are written as fractions of the respective totals
    for (double count : consCompFirst) {
        instance.setValue((Attribute) this.attributes.get(next++), count / consTotalFirst);
    }

    for (double count : nonConsCompFirst) {
        instance.setValue((Attribute) this.attributes.get(next++), count / nonConsTotalFirst);
    }

    for (double count : consCompSecond) {
        instance.setValue((Attribute) this.attributes.get(next++), count / consTotalSecond);
    }

    for (double count : nonConsCompSecond) {
        instance.setValue((Attribute) this.attributes.get(next++), count / nonConsTotalSecond);
    }

    instance.setValue((Attribute) this.attributes.get(next++), consPosFirst / (double) consTotalFirst);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsPosFirst / (double) nonConsTotalFirst);

    instance.setValue((Attribute) this.attributes.get(next++), consPosSecond / (double) consTotalSecond);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsPosSecond / (double) nonConsTotalSecond);

    final int consPosDifference = consPosFirst - consPosSecond;
    final int nonConsPosDifference = nonConsPosFirst - nonConsPosSecond;
    instance.setValue((Attribute) this.attributes.get(next++), consPosDifference);
    instance.setValue((Attribute) this.attributes.get(next++), nonConsPosDifference);

    return instance;
}