List of usage examples for weka.core Instances size
@Override public int size()
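For orientation, here is a minimal, self-contained sketch (not taken from any of the projects below; the class name InstancesSizeDemo is made up) showing what size() reports. In the Weka 3.7+ API, where Instances implements java.util.List<Instance>, it returns the same count as the older numInstances() call that several of the examples below still use.

import java.util.ArrayList;

import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;
import weka.core.Instances;

public class InstancesSizeDemo {
    public static void main(String[] args) {
        // Build an empty dataset with a single numeric attribute
        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        attributes.add(new Attribute("value"));
        Instances instances = new Instances("demo", attributes, 0);

        System.out.println(instances.size()); // 0

        // Add three instances
        for (int i = 0; i < 3; i++) {
            Instance instance = new DenseInstance(instances.numAttributes());
            instance.setValue(instances.attribute(0), i);
            instances.add(instance);
        }

        // size() counts the instances currently in the dataset
        System.out.println(instances.size());         // 3
        System.out.println(instances.numInstances()); // 3
    }
}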
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

private int[] determineDictionary(Instances instances) {
    if (stringAttributeIndex < 0) {
        throw new IllegalStateException("String attribute index not valid");
    }

    // Operate on a per-class basis if class attribute is set
    int classInd = instances.classIndex();
    int values = 1;
    if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
        values = instances.attribute(classInd).numValues();
    }

    HashMap<String, Integer> termIndices = new HashMap<String, Integer>();
    for (int i = 0; i < termList.size(); i++) {
        termIndices.put(termList.get(i), i);
    }

    // Create the trie for matching terms
    Trie termTrie = new Trie(termList);

    // Initialize the dictionary/count map
    ArrayList<HashMap<Integer, Count>> termCounts = new ArrayList<HashMap<Integer, Count>>();
    for (int z = 0; z < values; z++) {
        termCounts.add(new HashMap<Integer, Count>());
    }

    // Go through all the instances and count the emoticons
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);

        int vInd = 0;
        if (!m_doNotOperateOnPerClassBasis && (classInd != -1)) {
            vInd = (int) instance.classValue();
        }

        // Get the string attribute to examine
        String stringValue = instance.stringValue(stringAttributeIndex);

        HashMap<Integer, Count> termCountsForClass = termCounts.get(vInd);

        HashMap<String, Integer> termMatches = termTrie.countNonoverlappingMatches(stringValue);
        for (Map.Entry<String, Integer> entry : termMatches.entrySet()) {
            String term = entry.getKey();
            int termIdx = termIndices.get(term);
            int matches = entry.getValue();

            Count count = termCountsForClass.get(termIdx);
            if (count == null) {
                count = new Count(0);
                termCountsForClass.put(termIdx, count);
            }

            if (matches > 0) {
                count.docCount += 1;
                count.count += matches;
            }
        }
    }

    // Figure out the minimum required word frequency
    int prune[] = new int[values];
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);

        int array[] = new int[termCountsForClass.size()];
        int pos = 0;
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            array[pos] = entry.getValue().count;
            pos++;
        }

        // sort the array
        sortArray(array);

        if (array.length < m_WordsToKeep) {
            // if there aren't enough words, set the threshold to minFreq
            prune[z] = m_minTermFreq;
        } else {
            // otherwise set it to be at least minFreq
            prune[z] = Math.max(m_minTermFreq, array[array.length - m_WordsToKeep]);
        }
    }

    // Add the word vector attributes (eliminating duplicates
    // that occur in multiple classes)
    HashSet<String> selectedTerms = new HashSet<String>();
    for (int z = 0; z < values; z++) {
        HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
        for (Map.Entry<Integer, Count> entry : termCountsForClass.entrySet()) {
            int termIndex = entry.getKey();
            String term = termList.get(termIndex);
            Count count = entry.getValue();
            if (count.count >= prune[z]) {
                selectedTerms.add(term);
            }
        }
    }

    // Save the selected terms as a list
    this.m_selectedTerms = new ArrayList<String>(selectedTerms);
    this.m_selectedTermsTrie = new Trie(this.m_selectedTerms);
    this.m_NumInstances = instances.size();

    // Construct the selected terms to index map
    this.m_selectedTermIndices = new HashMap<String, Integer>();
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        m_selectedTermIndices.put(m_selectedTerms.get(i), i);
    }

    // Compute document frequencies, organized by selected term index (not original term index)
    int[] docsCounts = new int[m_selectedTerms.size()];
    for (int i = 0; i < m_selectedTerms.size(); i++) {
        String term = m_selectedTerms.get(i);
        int termIndex = termIndices.get(term);

        int docsCount = 0;
        for (int z = 0; z < values; z++) {
            HashMap<Integer, Count> termCountsForClass = termCounts.get(z);
            Count count = termCountsForClass.get(termIndex);
            if (count != null) {
                docsCount += count.docCount;
            }
        }

        docsCounts[i] = docsCount;
    }

    return docsCounts;
}
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

@Override
protected Instances process(Instances instances) throws Exception {
    Instances result = new Instances(getOutputFormat(), 0);

    // Convert all instances w/o normalization
    ArrayList<Instance> converted = new ArrayList<Instance>();
    ArrayList<Double> docLengths = new ArrayList<Double>();

    if (!isFirstBatchDone()) {
        m_AvgDocLength = 0;
    }

    for (int i = 0; i < instances.size(); i++) {
        double docLength = convertInstancewoDocNorm(instances.instance(i), converted);

        // Need to compute average document length if necessary
        if (m_filterType != FILTER_NONE) {
            if (!isFirstBatchDone()) {
                m_AvgDocLength += docLength;
            }
            docLengths.add(docLength);
        }
    }

    if (m_filterType != FILTER_NONE) {
        if (!isFirstBatchDone()) {
            m_AvgDocLength /= instances.size();
        }

        // Perform normalization if necessary.
        if (isFirstBatchDone() || (!isFirstBatchDone() && m_filterType == FILTER_NORMALIZE_ALL)) {
            for (int i = 0; i < converted.size(); i++) {
                normalizeInstance(converted.get(i), docLengths.get(i));
            }
        }
    }

    // Push all instances into the output queue
    for (int i = 0; i < converted.size(); i++) {
        result.add(converted.get(i));
    }

    return result;
}
From source file: etc.aloe.filters.StringToDictionaryVector.java
License: Open Source License

public static void main(String[] args) {
    // Create a test dataset
    ArrayList<Attribute> attributes = new ArrayList<Attribute>();
    attributes.add(new Attribute("message", (ArrayList<String>) null));
    attributes.add(new Attribute("id"));
    {
        ArrayList<String> classValues = new ArrayList<String>();
        classValues.add("0");
        classValues.add("1");
        attributes.add(new Attribute("class", classValues));
    }

    Instances instances = new Instances("test", attributes, 0);
    instances.setClassIndex(2);

    String[] messages = new String[] { "No emoticons here", "I have a smiley :)",
            "Two smileys and a frownie :) :) :(", "Several emoticons :( :-( :) :-) ;-) 8-) :-/ :-P" };

    for (int i = 0; i < messages.length; i++) {
        Instance instance = new DenseInstance(instances.numAttributes());
        instance.setValue(instances.attribute(0), messages[i]);
        instance.setValue(instances.attribute(1), i);
        instance.setValue(instances.attribute(2), Integer.toString(i % 2));
        instances.add(instance);
    }

    System.out.println("Before filter:");
    for (int i = 0; i < instances.size(); i++) {
        System.out.println(instances.instance(i).toString());
    }

    try {
        String dictionaryName = "emoticons.txt";
        StringToDictionaryVector filter = new StringToDictionaryVector();
        List<String> termList = StringToDictionaryVector.readDictionaryFile(new File(dictionaryName));
        filter.setTermList(termList);
        filter.setMinTermFreq(1);
        filter.setTFTransform(true);
        filter.setIDFTransform(true);
        filter.setNormalizeDocLength(new SelectedTag(FILTER_NORMALIZE_TEST_ONLY, TAGS_FILTER));
        filter.setOutputWordCounts(true);
        filter.setStringAttribute("message");

        filter.setInputFormat(instances);
        Instances trans1 = Filter.useFilter(instances, filter);
        Instances trans2 = Filter.useFilter(instances, filter);

        System.out.println("\nFirst application:");
        System.out.println(trans1.toString());

        System.out.println("\nSecond application:");
        System.out.println(trans2.toString());
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file: etc.aloe.filters.WordFeaturesExtractor.java
License: Open Source License

protected List<List<String>> tokenizeDocuments(Instances instances) {
    // Convert all instances into term lists
    List<List<String>> documents = new ArrayList<List<String>>();

    for (int i = 0; i < instances.size(); i++) {
        Instance instance = instances.get(i);

        if (instance.isMissing(selectedAttributeIndex) == false) {
            List<String> words = tokenizeDocument(instance);
            documents.add(words);
        }
    }

    return documents;
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This is the constructor of the isolated appliance extractor class. It
 * creates the clusters of the isolated events and detects which of them
 * corresponds to the refrigerator.
 *
 * @param events
 *          The list of all the events detected by the Event Detector.
 * @throws Exception
 */
public IsolatedApplianceExtractor(ArrayList<Event> events) throws Exception {
    // Initializing auxiliary variables
    boolean q1 = false;
    boolean q3 = false;
    boolean pDiff = false;

    // Checking each event. The ones that contain one rising and one reduction
    // point, or two reduction points with the second much larger than the
    // first, are selected and added to the array
    for (Event event : events) {
        // System.out.println("Event:" + event.getId() + " Rising Points: "
        //                    + event.getRisingPoints().size()
        //                    + " Reduction Points: "
        //                    + event.getReductionPoints().size());

        if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 1) {
            isolated.add(event);
        }
        // else if (event.getRisingPoints().size() == 1
        //          && event.getReductionPoints().size() == 2) {
        //
        //     q1 = (event.getRisingPoints().get(0).getQDiff() > 0);
        //     pDiff = (Math.abs(event.getReductionPoints().get(1).getPDiff()) > Constants.ISOLATED_TIMES_UP
        //              * Math.abs(event.getReductionPoints().get(0).getPDiff()));
        //     q3 = (event.getReductionPoints().get(1).getQDiff() < 0);
        //
        //     if (q1 && q3 && pDiff) {
        //         event.getReductionPoints().remove(0);
        //         isolated.add(event);
        //     }
        // }
    }

    // The instances for the cluster procedure are created
    Instances inst = createInstances(isolated);
    // System.out.println(inst.toString());

    if (inst.size() > 0) {
        // The clustering takes place
        fillClusters(inst);
        // System.out.println("Clusters:" + clusters.toString());

        // The refrigerator cluster is found
        findRefrigerator();
        // System.out.println("Fridge Cluster:" + refrigeratorCluster);
    }
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This is an auxiliary function that prepares the clustering data set. The
 * events must be translated to instances of the data set that can be used for
 * clustering.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data
 * @throws Exception
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Initializing auxiliary variables, namely the attributes of the data set
    Attribute id = new Attribute("id");
    Attribute pDiffRise = new Attribute("pDiffRise");
    Attribute qDiffRise = new Attribute("qDiffRise");
    Attribute pDiffReduce = new Attribute("pDiffReduce");
    Attribute qDiffReduce = new Attribute("qDiffReduce");

    ArrayList<Attribute> attr = new ArrayList<Attribute>();
    attr.add(id);
    attr.add(pDiffRise);
    attr.add(qDiffRise);
    attr.add(pDiffReduce);
    attr.add(qDiffReduce);

    Instances instances = new Instances("Isolated", attr, 0);

    // Each event is translated to an instance with the above attributes
    for (Event event : isolated) {
        Instance inst = new DenseInstance(5);
        inst.setValue(id, event.getId());
        inst.setValue(pDiffRise, event.getRisingPoints().get(0).getPDiff());
        inst.setValue(qDiffRise, event.getRisingPoints().get(0).getQDiff());
        inst.setValue(pDiffReduce, event.getReductionPoints().get(0).getPDiff());
        inst.setValue(qDiffReduce, event.getReductionPoints().get(0).getQDiff());
        instances.add(inst);
    }

    int n = Constants.MAX_CLUSTERS_NUMBER;
    Instances newInst = null;

    System.out.println("Instances: " + instances.toSummaryString());
    System.out.println("Max Clusters: " + n);

    // Create the AddCluster filter of Weka and set up the hierarchical
    // clusterer.
    AddCluster addcluster = new AddCluster();

    if (instances.size() > Constants.KMEANS_LIMIT_NUMBER || instances.size() == 0) {
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        String[] opt = { "-N", "" + n + "", "-P", "-D", "-L", "AVERAGE" };
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setNumClusters(n);
        clusterer.setOptions(opt);
        clusterer.setPrintNewick(true);
        clusterer.setDebug(true);
        // clusterer.getOptions();

        addcluster.setClusterer(clusterer);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    } else {
        SimpleKMeans kmeans = new SimpleKMeans();

        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(n);
        kmeans.buildClusterer(instances);

        addcluster.setClusterer(kmeans);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    }

    return newInst;
}
From source file: eu.cassandra.appliance.IsolatedApplianceExtractor.java
License: Apache License

/**
 * This function takes the instances coming out of the clustering and puts
 * each event into its respective cluster.
 *
 * @param inst
 *          The clustered instances
 */
private void fillClusters(Instances inst) {
    // Initializing auxiliary variables
    ArrayList<Integer> temp;

    // For each instance, check the cluster value and put it in the correct
    // cluster
    for (int i = 0; i < inst.size(); i++) {
        String cluster = inst.get(i).stringValue(inst.attribute(5));

        if (!clusters.containsKey(cluster))
            temp = new ArrayList<Integer>();
        else
            temp = clusters.get(cluster);

        temp.add(i);
        clusters.put(cluster, temp);
    }
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This is the constructor of the isolated events extractor class. It
 * creates the clusters of the isolated events and detects which of them
 * corresponds to the refrigerator.
 *
 * @param events
 *          The list of all the events detected by the Event Detector.
 * @throws Exception
 */
public IsolatedEventsExtractor(ArrayList<Event> events) throws Exception {
    log.info("==============ISOLATED EVENTS===============");

    // Initializing auxiliary variables
    boolean q1 = false;
    boolean q3 = false;
    boolean pDiff = false;

    // Checking each event. The ones that contain one rising and one reduction
    // point, or two reduction points with the second much larger than the
    // first, are selected and added to the array
    for (Event event : events) {
        log.debug("");
        log.debug("Event:" + event.getId() + " Rising Points: " + event.getRisingPoints().size()
                + " Reduction Points: " + event.getReductionPoints().size());

        if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 1) {
            isolated.add(event);
            log.debug("Isolated Event");
        } else if (event.getRisingPoints().size() == 1 && event.getReductionPoints().size() == 2) {
            q1 = (event.getRisingPoints().get(0).getQDiff() > 0);
            pDiff = (Math.abs(event.getReductionPoints().get(1).getPDiff()) > Constants.ISOLATED_TIMES_UP
                    * Math.abs(event.getReductionPoints().get(0).getPDiff()));
            q3 = (event.getReductionPoints().get(1).getQDiff() < 0);

            // log.debug("Q1 > 0:" + q1);
            // log.debug("PDiff:" + pDiff);
            // log.debug("Q3 < 0:" + q3);

            if (q1 && q3 && pDiff) {
                event.getReductionPoints().remove(0);
                isolated.add(event);
                log.debug("Isolated Event");
            }
        }
    }

    log.info("Number of Isolated Events: " + isolated.size());
    log.info("");

    // TODO Add clustering in order to find the correct fridge...
    if (Constants.REF_LOOSE_COUPLING == false) {
        log.info("============FRIDGE CLUSTERING===============");

        Instances inst = createInstances(isolated);
        // log.info(inst.toString());

        if (inst.size() > 0) {
            fillClusters(inst);
            findRefrigerator();
        }

        clusterRefMeans();
    }

    for (Event event : isolated) {
        log.debug("");
        log.debug("Event: " + event.getId());
        event.detectBasicShapes(true);

        if (event.getRisingPoints().size() > 0 && event.getReductionPoints().size() > 0)
            event.detectMatchingPoints(true);

        if (event.getRisingPoints().size() > 0 && event.getReductionPoints().size() > 0)
            event.findCombinations(true);

        event.calculateFinalPairs();
        // event.status2();
    }
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This is an auxiliary function that prepares the clustering data set. The
 * events must be translated to instances of the data set that can be used for
 * clustering.
 *
 * @param isolated
 *          The list of the events containing an isolated appliance.
 * @return The instances of the data
 * @throws Exception
 */
private Instances createInstances(ArrayList<Event> isolated) throws Exception {
    // Initializing auxiliary variables, namely the attributes of the data set
    Attribute id = new Attribute("id");
    Attribute pDiffRise = new Attribute("pDiffRise");
    Attribute qDiffRise = new Attribute("qDiffRise");
    Attribute pDiffReduce = new Attribute("pDiffReduce");
    Attribute qDiffReduce = new Attribute("qDiffReduce");
    Attribute duration = new Attribute("duration");

    ArrayList<Attribute> attr = new ArrayList<Attribute>();
    attr.add(id);
    attr.add(pDiffRise);
    attr.add(qDiffRise);
    attr.add(pDiffReduce);
    attr.add(qDiffReduce);
    attr.add(duration);

    Instances instances = new Instances("Isolated", attr, 0);

    // Each event is translated to an instance with the above attributes
    for (Event event : isolated) {
        Instance inst = new DenseInstance(6);
        inst.setValue(id, event.getId());
        inst.setValue(pDiffRise, event.getRisingPoints().get(0).getPDiff());
        inst.setValue(qDiffRise, event.getRisingPoints().get(0).getQDiff());
        inst.setValue(pDiffReduce, event.getReductionPoints().get(0).getPDiff());
        inst.setValue(qDiffReduce, event.getReductionPoints().get(0).getQDiff());
        inst.setValue(duration, event.getEndMinute() - event.getStartMinute());
        instances.add(inst);
    }

    int n = Constants.MAX_CLUSTERS_NUMBER;
    Instances newInst = null;

    log.info("Instances: " + instances.toSummaryString());
    log.info("Max Clusters: " + n);

    // Create the AddCluster filter of Weka and set up the hierarchical
    // clusterer.
    AddCluster addcluster = new AddCluster();

    if (instances.size() > Constants.KMEANS_LIMIT_NUMBER || instances.size() == 0) {
        HierarchicalClusterer clusterer = new HierarchicalClusterer();
        String[] opt = { "-N", "" + n + "", "-P", "-D", "-L", "AVERAGE" };
        clusterer.setDistanceFunction(new EuclideanDistance());
        clusterer.setNumClusters(n);
        clusterer.setOptions(opt);
        clusterer.setPrintNewick(true);
        clusterer.setDebug(true);
        // clusterer.getOptions();

        addcluster.setClusterer(clusterer);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    } else {
        SimpleKMeans kmeans = new SimpleKMeans();

        kmeans.setSeed(10);
        // This is the important parameter to set
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(n);
        kmeans.buildClusterer(instances);

        addcluster.setClusterer(kmeans);
        addcluster.setInputFormat(instances);
        addcluster.setIgnoredAttributeIndices("1");

        // Cluster data set
        newInst = Filter.useFilter(instances, addcluster);
    }

    return newInst;
}
From source file: eu.cassandra.appliance.IsolatedEventsExtractor.java
License: Apache License

/**
 * This function takes the instances coming out of the clustering and puts
 * each event into its respective cluster.
 *
 * @param inst
 *          The clustered instances
 */
private void fillClusters(Instances inst) {
    // Initializing auxiliary variables
    ArrayList<Integer> temp;

    // For each instance, check the cluster value and put it in the correct
    // cluster
    for (int i = 0; i < inst.size(); i++) {
        String cluster = inst.get(i).stringValue(inst.attribute(6));

        if (!clusters.containsKey(cluster))
            temp = new ArrayList<Integer>();
        else
            temp = clusters.get(cluster);

        temp.add(i);
        clusters.put(cluster, temp);
    }
}