List of usage examples for weka.core Instances size
@Override public int size()
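For orientation, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name InstancesSizeDemo is made up) showing what size() reports. In the Weka 3.7+ API, where Instances implements java.util.List<Instance>, it returns the same count as the older numInstances() call that several of the examples below still use.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstancesSizeDemo {
    public static void main(String[] args) {
        // Build an empty dataset with a single numeric attribute
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("value"));
        Instances instances = new Instances("demo", attributes, 0);

        System.out.println(instances.size()); // 0

        // Add three instances
        for (int i = 0; i < 3; i++) {
            Instance instance = new DenseInstance(instances.numAttributes());
            instance.setValue(instances.attribute(0), i);
            instances.add(instance);
        }

        // size() counts the instances currently in the dataset
        System.out.println(instances.size());         // 3
        System.out.println(instances.numInstances()); // 3
    }
}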
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    // Create the trie for matching terms
    Trie termTrie = new Trie(termList);

    // Initialize the dictionary/count map
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    // Go through all the instances and count the emoticons
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);

        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Get the string attribute to examine
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);
            int matches = entry.getValue();

            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }

            if (matches > 0) {
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }

        // sort the array
        sortArray(array);

        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    // Save the selected terms as a list
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    // Construct the selected terms to index map
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not original term index)
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);

        int docsCount = 0;
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }

        docsCounts[i] = docsCount;
    }

    return docsCounts;
}
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

@Override
protected Instances process(Instances instances) throws Exception {
    Instances result = new Instances(getOutputFormat(), 0);

    // Convert all instances w/o normalization
    ArrayList<Instance> converted = new ArrayList<Instance>();
    ArrayList<Double> docLengths = new ArrayList<Double>();

    if (!isFirstBatchDone()) {
        m_AvgDocLength = 0;
    }

    for (int i = 0; i < instances.size(); i++) {
        double docLength = convertInstancewoDocNorm(instances.instance(i), converted);

        // Need to compute average document length if necessary
        if (m_filterType != FILTER_NONE) {
            if (!isFirstBatchDone()) {
                m_AvgDocLength += docLength;
            }
            docLengths.add(docLength);
        }
    }

    if (m_filterType != FILTER_NONE) {
        if (!isFirstBatchDone()) {
            m_AvgDocLength /= instances.size();
        }

        // Perform normalization if necessary.
        if (isFirstBatchDone() || (!isFirstBatchDone() && m_filterType == FILTER_NORMALIZE_ALL)) {
            for (int i = 0; i < converted.size(); i++) {
                normalizeInstance(converted.get(i), docLengths.get(i));
            }
        }
    }

    // Push all instances into the output queue
    for (int i = 0; i < converted.size(); i++) {
        result.add(converted.get(i));
    }

    return result;
}
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

public static void main(String[] args) {
    // Create a test dataset
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    attributes.add(new Attribute("message", (ArrayList<String>) null));
    attributes.add(new Attribute("id"));
    {
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("0");
        classValues.add("1");
        attributes.add(new Attribute("class", classValues));
    }

    Instances instances = new Instances("test", attributes, 0);
    instances.setClassIndex(2);

    String[] messages = new String[] { "No emoticons here", "I have a smiley :)",
            "Two smileys and a frownie :) :) :(", "Several emoticons :( :-( :) :-) ;-) 8-) :-/ :-P" };

    for (int i = 0; i < messages.length; i++) {
        Instance instance = new DenseInstance(instances.numAttributes());
        instance.setValue(instances.attribute(0), messages[i]);
        instance.setValue(instances.attribute(1), i);
        instance.setValue(instances.attribute(2), Integer.toString(i % 2));
        instances.add(instance);
    }

    System.out.println("Before filter:");
    for (int i = 0; i < instances.size(); i++) {
        System.out.println(instances.instance(i).toString());
    }

    try {
        String dictionaryName = "emoticons.txt";
        StringToDictionaryVector filter = new StringToDictionaryVector();
        List<String> termList = StringToDictionaryVector.readDictionaryFile(new File(dictionaryName));
        filter.setTermList(termList);
        filter.setMinTermFreq(1);
        filter.setTFTransform(true);
        filter.setIDFTransform(true);
        filter.setNormalizeDocLength(new SelectedTag(FILTER_NORMALIZE_TEST_ONLY, TAGS_FILTER));
        filter.setOutputWordCounts(true);
        filter.setStringAttribute("message");

        filter.setInputFormat(instances);
        Instances trans1 = Filter.useFilter(instances, filter);
        Instances trans2 = Filter.useFilter(instances, filter);

        System.out.println("\nFirst application:");
        System.out.println(trans1.toString());

        System.out.println("\nSecond application:");
        System.out.println(trans2.toString());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file: etc.aloe.filters.WordFeaturesExtractor.java
License: Open Source License

protected List<List<String>> tokenizeDocuments(Instances instances) {
    // Convert all instances into term lists
    List<List<String>> documents = new ArrayList<List<String>>();

    for (int i = 0; i < instances.size(); i++) {
        Instance instance = instances.get(i);

        if (instance.isMissing(selectedAttributeIndex) == false) {
            List<String> words = tokenizeDocument(instance);
            documents.add(words);
        }
    }

    return documents;
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This is the constructor of the isolated appliance extractor class. It
 * creates the clusters of the isolated events and detects which of them
 * corresponds to the refrigerator.
 *
 * @param events
 *          The list of all the events detected by the Event Detector.
 * @throws Exception
 */
public IsolatedApplianceExtractor(ArrayList<Event> events) throws Exception {
    // Initializing auxiliary variables
    boolean q1 = false;
    boolean q3 = false;
    boolean pDiff = false;

    // Checking each event. The ones that contain one rising and one reduction
    // point, or two reduction points with the second much larger than the
    // first, are selected and added to the array
    for (Event event : events) {
        // System.out.println("Event:" + event.getId() + " Rising Points: "
        //                    + event.getRisingPoints().size()
        //                    + " Reduction Points: "
        //                    + event.getReductionPoints().size());

        if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 1) {
            isolated.add(event);
        }
        // else if (event.getRisingPoints().size() == 1
        //          && event.getReductionPoints().size() == 2) {
        //
        //     q1 = (event.getRisingPoints().get(0).getQDiff() > 0);
        //     pDiff = (Math.abs(event.getReductionPoints().get(1).getPDiff()) > Constants.ISOLATED_TIMES_UP
        //              * Math.abs(event.getReductionPoints().get(0).getPDiff()));
        //     q3 = (event.getReductionPoints().get(1).getQDiff() < 0);
        //
        //     if (q1 && q3 && pDiff) {
        //         event.getReductionPoints().remove(0);
        //         isolated.add(event);
        //     }
        // }
    }

    // The instances for the cluster procedure are created
    Instances inst = createInstances(isolated);
    // System.out.println(inst.toString());

    if (inst.size() > 0) {
        // The clustering takes place
        fillClusters(inst);
        // System.out.println("Clusters:" + clusters.toString());

        // The refrigerator cluster is found
        findRefrigerator();
        // System.out.println("Fridge Cluster:" + refrigeratorCluster);
    }
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This is an auxiliary function that prepares the clustering data set. The
 * events must be translated to instances of the data set that can be used for
 * clustering.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data
 * @throws Exception
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Initializing auxiliary variables, namely the attributes of the data set
    Attribute id = new Attribute("id");
    Attribute pDiffRise = new Attribute("pDiffRise");
    Attribute qDiffRise = new Attribute("qDiffRise");
    Attribute pDiffReduce = new Attribute("pDiffReduce");
    Attribute qDiffReduce = new Attribute("qDiffReduce");

    ArrayList<Attribute> attr = new ArrayList<Attribute>();
    attr.add(id);
    attr.add(pDiffRise);
    attr.add(qDiffRise);
    attr.add(pDiffReduce);
    attr.add(qDiffReduce);

    Instances instances = new Instances("Isolated", attr, 0);

    // Each event is translated to an instance with the above attributes
    for (Event event : isolated) {
        Instance inst = new DenseInstance(5);
        inst.setValue(id, event.getId());
        inst.setValue(pDiffRise, event.getRisingPoints().get(0).getPDiff());
        inst.setValue(qDiffRise, event.getRisingPoints().get(0).getQDiff());
        inst.setValue(pDiffReduce, event.getReductionPoints().get(0).getPDiff());
        inst.setValue(qDiffReduce, event.getReductionPoints().get(0).getQDiff());
        instances.add(inst);
    }

    int n = Constants.MAX_CLUSTERS_NUMBER;
    Instances newInst = null;

    System.out.println("Instances: " + instances.toSummaryString());
    System.out.println("Max Clusters: " + n);

    // Create the AddCluster filter of Weka and set up the hierarchical
    // clusterer.
    AddCluster addcluster = new AddCluster();

    if (instances.size() > Constants.KMEANS_LIMIT_NUMBER || instances.size() == 0) {
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        String[] opt = { "-N", "" + n + "", "-P", "-D", "-L", "AVERAGE" };
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setNumClusters(n);
        clusterer.setOptions(opt);
        clusterer.setPrintNewick(true);
        clusterer.setDebug(true);
        // clusterer.getOptions();

        addcluster.setClusterer(clusterer);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    } else {
        SimpleKMeans kmeans = new SimpleKMeans();

        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(n);
        kmeans.buildClusterer(instances);

        addcluster.setClusterer(kmeans);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    }

    return newInst;
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This function takes the instances coming out of the clustering and puts
 * each event into its respective cluster.
 *
 * @param inst
 *          The clustered instances
 */
private void fillClusters(Instances inst) {
    // Initializing auxiliary variables
    ArrayList<Integer> temp;

    // For each instance, check the cluster value and put it in the correct
    // cluster
    for (int i = 0; i < inst.size(); i++) {
        String cluster = inst.get(i).stringValue(inst.attribute(5));

        if (!clusters.containsKey(cluster))
            temp = new ArrayList<Integer>();
        else
            temp = clusters.get(cluster);

        temp.add(i);
        clusters.put(cluster, temp);
    }
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This is the constructor of the isolated events extractor class. It
 * creates the clusters of the isolated events and detects which of them
 * corresponds to the refrigerator.
 *
 * @param events
 *          The list of all the events detected by the Event Detector.
 * @throws Exception
 */
public IsolatedEventsExtractor(ArrayList<Event> events) throws Exception {
    log.info("==============ISOLATED EVENTS===============");

    // Initializing auxiliary variables
    boolean q1 = false;
    boolean q3 = false;
    boolean pDiff = false;

    // Checking each event. The ones that contain one rising and one reduction
    // point, or two reduction points with the second much larger than the
    // first, are selected and added to the array
    for (Event event : events) {
        log.debug("");
        log.debug("Event:" + event.getId() + " Rising Points: " + event.getRisingPoints().size()
                + " Reduction Points: " + event.getReductionPoints().size());

        if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 1) {
            isolated.add(event);
            log.debug("Isolated Event");
        } else if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 2) {
            q1 = (event.getRisingPoints().get(0).getQDiff() > 0);
            pDiff = (Math.abs(event.getReductionPoints().get(1).getPDiff()) > Constants.ISOLATED_TIMES_UP
                    * Math.abs(event.getReductionPoints().get(0).getPDiff()));
            q3 = (event.getReductionPoints().get(1).getQDiff() < 0);

            // log.debug("Q1 > 0:" + q1);
            // log.debug("PDiff:" + pDiff);
            // log.debug("Q3 < 0:" + q3);

            if (q1 && q3 && pDiff) {
                event.getReductionPoints().remove(0);
                isolated.add(event);
                log.debug("Isolated Event");
            }
        }
    }

    log.info("Number of Isolated Events: " + isolated.size());
    log.info("");

    // TODO Add clustering in order to find the correct fridge...
    if (Constants.REF_LOOSE_COUPLING == false) {
        log.info("============FRIDGE CLUSTERING===============");

        Instances inst = createInstances(isolated);
        // log.info(inst.toString());

        if (inst.size() > 0) {
            fillClusters(inst);
            findRefrigerator();
        }

        clusterRefMeans();
    }

    for (Event event : isolated) {
        log.debug("");
        log.debug("Event: " + event.getId());
        event.detectBasicShapes(true);

        if (event.getRisingPoints().size() > 0 && event.getReductionPoints().size() > 0)
            event.detectMatchingPoints(true);

        if (event.getRisingPoints().size() > 0 && event.getReductionPoints().size() > 0)
            event.findCombinations(true);

        event.calculateFinalPairs();
        // event.status2();
    }
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This is an auxiliary function that prepares the clustering data set. The
 * events must be translated to instances of the data set that can be used for
 * clustering.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data
 * @throws Exception
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Initializing auxiliary variables, namely the attributes of the data set
    Attribute id = new Attribute("id");
    Attribute pDiffRise = new Attribute("pDiffRise");
    Attribute qDiffRise = new Attribute("qDiffRise");
    Attribute pDiffReduce = new Attribute("pDiffReduce");
    Attribute qDiffReduce = new Attribute("qDiffReduce");
    Attribute duration = new Attribute("duration");

    ArrayList<Attribute> attr = new ArrayList<Attribute>();
    attr.add(id);
    attr.add(pDiffRise);
    attr.add(qDiffRise);
    attr.add(pDiffReduce);
    attr.add(qDiffReduce);
    attr.add(duration);

    Instances instances = new Instances("Isolated", attr, 0);

    // Each event is translated to an instance with the above attributes
    for (Event event : isolated) {
        Instance inst = new DenseInstance(6);
        inst.setValue(id, event.getId());
        inst.setValue(pDiffRise, event.getRisingPoints().get(0).getPDiff());
        inst.setValue(qDiffRise, event.getRisingPoints().get(0).getQDiff());
        inst.setValue(pDiffReduce, event.getReductionPoints().get(0).getPDiff());
        inst.setValue(qDiffReduce, event.getReductionPoints().get(0).getQDiff());
        inst.setValue(duration, event.getEndMinute() - event.getStartMinute());
        instances.add(inst);
    }

    int n = Constants.MAX_CLUSTERS_NUMBER;
    Instances newInst = null;

    log.info("Instances: " + instances.toSummaryString());
    log.info("Max Clusters: " + n);

    // Create the AddCluster filter of Weka and set up the hierarchical
    // clusterer.
    AddCluster addcluster = new AddCluster();

    if (instances.size() > Constants.KMEANS_LIMIT_NUMBER || instances.size() == 0) {
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        String[] opt = { "-N", "" + n + "", "-P", "-D", "-L", "AVERAGE" };
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setNumClusters(n);
        clusterer.setOptions(opt);
        clusterer.setPrintNewick(true);
        clusterer.setDebug(true);
        // clusterer.getOptions();

        addcluster.setClusterer(clusterer);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    } else {
        SimpleKMeans kmeans = new SimpleKMeans();

        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(n);
        kmeans.buildClusterer(instances);

        addcluster.setClusterer(kmeans);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    }

    return newInst;
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This function takes the instances coming out of the clustering and puts
 * each event into its respective cluster.
 *
 * @param inst
 *          The clustered instances
 */
private void fillClusters(Instances inst) {
    // Initializing auxiliary variables
    ArrayList<Integer> temp;

    // For each instance, check the cluster value and put it in the correct
    // cluster
    for (int i = 0; i < inst.size(); i++) {
        String cluster = inst.get(i).stringValue(inst.attribute(6));

        if (!clusters.containsKey(cluster))
            temp = new ArrayList<Integer>();
        else
            temp = clusters.get(cluster);

        temp.add(i);
        clusters.put(cluster, temp);
    }
}