List of usage examples for the weka.core.FastVector no-argument constructor (new FastVector())
public FastVector()
From source file:DocClassifier.java
private FastVector createTerms(File[] files) { try {/*from w w w .j av a 2s .co m*/ Set<String> termSet = new HashSet<String>(); for (File file : files) { BufferedReader reader = new BufferedReader(new FileReader(file)); Set<String> docTermSet = new HashSet<String>(); while (reader.ready()) { String line = reader.readLine(); String[] words = line.split(" "); for (String word : words) { Kelime[] kelimeler = this.zemberek.kelimeCozumle(word); if (kelimeler.length > 0) { String kok = kelimeler[0].kok().icerik(); docTermSet.add(kok); termSet.add(kok); } } } // DF for a doc for (String t : docTermSet) { Double freq = this.idfMap.get(t); this.idfMap.put(t, ((freq != null) ? (freq + 1) : 1)); } reader.close(); } //Remove some words like ve,veya,de,da,in from set termSet = PreProcesser.filterTermSet(termSet); //IDF Calculation for (String t : termSet) { Double df = this.idfMap.get(t); if (df != null) { this.idfMap.put(t, Math.log(files.length / df) / Math.log(2)); } else { this.idfMap.put(t, 0.0); } //System.out.println(t + ": " + df); } // Attribute creation //System.err.println("\nAttribute:"); FastVector terms = new FastVector(); for (String term : termSet) { terms.addElement(new Attribute(term)); // System.err.println(term + "-"); } // Class values are created Set<String> classSet = new HashSet<String>(); for (File file : files) { classSet.add(file.getName().substring(0, 3).toLowerCase()); } //System.err.println("\nClass:"); this.classValues = new FastVector(); for (String category : classSet) { this.classValues.addElement(category); // System.out.print(category + "-"); } terms.addElement(new Attribute(CLASS_ATTR_NAME, classValues)); return terms; } catch (FileNotFoundException ex) { Logger.getLogger(DocClassifier.class.getName()).log(Level.SEVERE, null, ex); } catch (IOException ex) { Logger.getLogger(DocClassifier.class.getName()).log(Level.SEVERE, null, ex); } return null; }
From source file:TextDirectoryLoader.java
License:Open Source License
/** * Determines and returns (if possible) the structure (internally the * header) of the data set as an empty set of instances. * * @return the structure of the data set as an empty * set of Instances//from www . j ava 2s.co m * @throws IOException if an error occurs */ public Instances getStructure() throws IOException { if (getDirectory() == null) { throw new IOException("No directory/source has been specified"); } // determine class labels, i.e., sub-dirs if (m_structure == null) { String directoryPath = getDirectory().getAbsolutePath(); FastVector atts = new FastVector(); FastVector classes = new FastVector(); File dir = new File(directoryPath); String[] subdirs = dir.list(); for (int i = 0; i < subdirs.length; i++) { File subdir = new File(directoryPath + File.separator + subdirs[i]); if (subdir.isDirectory()) classes.addElement(subdirs[i]); } atts.addElement(new Attribute("text", (FastVector) null)); if (m_OutputFilename) atts.addElement(new Attribute("filename", (FastVector) null)); // make sure that the name of the class attribute is unlikely to // clash with any attribute created via the StringToWordVector filter atts.addElement(new Attribute("@@class@@", classes)); String relName = directoryPath.replaceAll("/", "_"); relName = relName.replaceAll("\\\\", "_").replaceAll(":", "_"); m_structure = new Instances(relName, atts, 0); m_structure.setClassIndex(m_structure.numAttributes() - 1); } return m_structure; }
From source file:TextDirectoryLoader.java
License:Open Source License
/** * Return the full data set. If the structure hasn't yet been determined * by a call to getStructure then method should do so before processing * the rest of the data set./* w ww . j a v a2s . co m*/ * * @return the structure of the data set as an empty set of Instances * @throws IOException if there is no source or parsing fails */ public Instances getDataSet() throws IOException { if (getDirectory() == null) throw new IOException("No directory/source has been specified"); String directoryPath = getDirectory().getAbsolutePath(); FastVector classes = new FastVector(); Enumeration enm = getStructure().classAttribute().enumerateValues(); while (enm.hasMoreElements()) classes.addElement(enm.nextElement()); Instances data = getStructure(); int fileCount = 0; for (int k = 0; k < classes.size(); k++) { String subdirPath = (String) classes.elementAt(k); File subdir = new File(directoryPath + File.separator + subdirPath); String[] files = subdir.list(); for (int j = 0; j < files.length; j++) { try { fileCount++; if (getDebug()) System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]); double[] newInst = null; if (m_OutputFilename) newInst = new double[3]; else newInst = new double[2]; File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]); BufferedReader is; if (m_charSet == null || m_charSet.length() == 0) { is = new BufferedReader(new InputStreamReader(new FileInputStream(txt))); } else { is = new BufferedReader(new InputStreamReader(new FileInputStream(txt), m_charSet)); } StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString()); if (m_OutputFilename) newInst[1] = (double) data.attribute(1) .addStringValue(subdirPath + File.separator + files[j]); newInst[data.classIndex()] = (double) k; data.add(new Instance(1.0, newInst)); is.close(); } catch (Exception e) { 
System.err.println("failed to convert file: " + directoryPath + File.separator + subdirPath + File.separator + files[j]); } } } return data; }
From source file:ArrayLoader.java
License:Open Source License
/** * Attempts to parse a line of the data set. * * @param tokenizer the tokenizer// w w w.j a v a 2 s .c o m * @return a FastVector containg String and Double objects representing * the values of the instance. * @exception IOException if an error occurs * * <pre><jml> * private_normal_behavior * requires: tokenizer != null; * ensures: \result != null; * also * private_exceptional_behavior * requires: tokenizer == null * || (* unsucessful parse *); * signals: (IOException); * </jml></pre> */ private FastVector getInstance(String[] data) throws IOException { FastVector current = new FastVector(); for (int i = 0; i < data.length; i++) { if (data[i].equals(m_MissingValue)) { current.addElement(new String(m_MissingValue)); } else { // try to parse as a number try { double val = Double.valueOf(data[i]).doubleValue(); current.addElement(new Double(val)); } catch (NumberFormatException e) { // otherwise assume its an enumerated value current.addElement(new String(data[i])); } } } // check number of values read if (current.size() != m_structure.numAttributes()) { System.out.println("Incorrect Structure"); } // check for structure update try { checkStructure(current); } catch (Exception ex) { ex.printStackTrace(); } return current; }
From source file:ArrayLoader.java
License:Open Source License
/** * Assumes the first line of the file contains the attribute names. * Assumes all attributes are real (Reading the full data set with * getDataSet will establish the true structure). * *//*from w w w . ja v a 2 s .c o m*/ private void readHeader(String[] column) throws IOException { FastVector attribNames = new FastVector(); // Assume first row of data are the column titles for (int i = 0; i < column.length; i++) { attribNames.addElement(new Attribute(column[i])); } m_structure = new Instances("DataArray", attribNames, 0); }
From source file:LabeledItemSet.java
License:Open Source License
/**
 * Merges all item sets in the set of (k-1)-item sets to create the
 * (k)-item sets and updates the counters. Two item sets are merged only
 * when they share the same class label and agree on a common prefix of
 * 'size' items; candidates with conflicting positions are discarded.
 *
 * @return the generated (k)-item sets
 * @param totalTrans the total number of transactions
 * @param itemSets the set of (k-1)-item sets
 * @param size the value of (k-1)
 */
public static FastVector mergeAllItemSets(FastVector itemSets, int size, int totalTrans) {
    FastVector newVector = new FastVector();
    LabeledItemSet result;
    int numFound, k;
    for (int i = 0; i < itemSets.size(); i++) {
        LabeledItemSet first = (LabeledItemSet) itemSets.elementAt(i);
        out: for (int j = i + 1; j < itemSets.size(); j++) {
            LabeledItemSet second = (LabeledItemSet) itemSets.elementAt(j);
            // Only item sets with the same class label can merge; advance j
            // until the labels match (exhausting the vector ends the scan).
            while (first.m_classLabel != second.m_classLabel) {
                j++;
                if (j == itemSets.size())
                    break out;
                second = (LabeledItemSet) itemSets.elementAt(j);
            }
            result = new LabeledItemSet(totalTrans, first.m_classLabel);
            result.m_items = new int[first.m_items.length];
            // Find and copy common prefix of size 'size' (-1 marks "no item
            // at this position" and does not count towards the prefix).
            numFound = 0;
            k = 0;
            while (numFound < size) {
                if (first.m_items[k] == second.m_items[k]) {
                    if (first.m_items[k] != -1)
                        numFound++;
                    result.m_items[k] = first.m_items[k];
                } else
                    break out;
                k++;
            }
            // Check difference: past the prefix, the two sets may not both
            // define an item at the same position.
            while (k < first.m_items.length) {
                if ((first.m_items[k] != -1) && (second.m_items[k] != -1))
                    break;
                else {
                    if (first.m_items[k] != -1)
                        result.m_items[k] = first.m_items[k];
                    else
                        result.m_items[k] = second.m_items[k];
                }
                k++;
            }
            // Keep the candidate only if the full length was consumed
            // without hitting a conflict; counters start at zero.
            if (k == first.m_items.length) {
                result.m_ruleSupCounter = 0;
                result.m_counter = 0;
                newVector.addElement(result);
            }
        }
    }
    return newVector;
}
From source file:LabeledItemSet.java
License:Open Source License
/** * Converts the header info of the given set of instances into a set * of item sets (singletons). The ordering of values in the header file * determines the lexicographic order. Each item set knows its class label. * @return a set of item sets, each containing a single item * @param instancesNoClass instances without the class attribute * @param classes the values of the class attribute sorted according to instances * @exception Exception if singletons can't be generated successfully *///from w w w . j a v a 2 s. co m public static FastVector singletons(Instances instancesNoClass, Instances classes) throws Exception { FastVector cSet, setOfItemSets = new FastVector(); LabeledItemSet current; //make singletons for (int i = 0; i < instancesNoClass.numAttributes(); i++) { if (instancesNoClass.attribute(i).isNumeric()) throw new Exception("Can't handle numeric attributes!"); for (int j = 0; j < instancesNoClass.attribute(i).numValues(); j++) { for (int k = 0; k < (classes.attribute(0)).numValues(); k++) { current = new LabeledItemSet(instancesNoClass.numInstances(), k); current.m_items = new int[instancesNoClass.numAttributes()]; for (int l = 0; l < instancesNoClass.numAttributes(); l++) current.m_items[l] = -1; current.m_items[i] = j; setOfItemSets.addElement(current); } } } return setOfItemSets; }
From source file:LabeledItemSet.java
License:Open Source License
/** * Generates rules out of item sets//from w w w . j a v a2 s . co m * @param minConfidence the minimum confidence * @param noPrune flag indicating whether the rules are pruned accoridng to the minimum confidence value * @return a set of rules */ public final FastVector[] generateRules(double minConfidence, boolean noPrune) { FastVector premises = new FastVector(), consequences = new FastVector(), conf = new FastVector(); FastVector[] rules = new FastVector[3]; ItemSet premise, consequence; // Generate all rules with class in the consequence. premise = new ItemSet(m_totalTransactions); consequence = new ItemSet(m_totalTransactions); int[] premiseItems = new int[m_items.length]; int[] consequenceItems = new int[1]; System.arraycopy(m_items, 0, premiseItems, 0, m_items.length); consequence.setItem(consequenceItems); premise.setItem(premiseItems); consequence.setItemAt(m_classLabel, 0); consequence.setCounter(this.m_ruleSupCounter); premise.setCounter(this.m_counter); premises.addElement(premise); consequences.addElement(consequence); conf.addElement(new Double((double) this.m_ruleSupCounter / (double) this.m_counter)); rules[0] = premises; rules[1] = consequences; rules[2] = conf; if (!noPrune) pruneRules(rules, minConfidence); return rules; }
From source file:MultiClassClassifier.java
License:Open Source License
/**
 * Builds the classifiers. Three strategies are used depending on the
 * class count and the configured method: a single base classifier for
 * binary problems, one classifier per class pair for 1-vs-1, or one
 * classifier per code word for the error-correcting-code methods.
 *
 * @param insts the training data.
 * @throws Exception if a classifier can't be built
 */
public void buildClassifier(Instances insts) throws Exception {
    Instances newInsts;

    // can classifier handle the data?
    getCapabilities().testWithFail(insts);

    // remove instances with missing class (work on a copy)
    insts = new Instances(insts);
    insts.deleteWithMissingClass();

    if (m_Classifier == null) {
        throw new Exception("No base classifier has been set!");
    }
    // ZeroR fallback model, built on the full training data.
    m_ZeroR = new ZeroR();
    m_ZeroR.buildClassifier(insts);

    m_TwoClassDataset = null;

    int numClassifiers = insts.numClasses();
    if (numClassifiers <= 2) {
        // Two classes or fewer: the base classifier handles it directly.
        m_Classifiers = Classifier.makeCopies(m_Classifier, 1);
        m_Classifiers[0].buildClassifier(insts);
        m_ClassFilters = null;
    } else if (m_Method == METHOD_1_AGAINST_1) {
        // generate fastvector of pairs (i, j) with i < j
        FastVector pairs = new FastVector();
        for (int i = 0; i < insts.numClasses(); i++) {
            for (int j = 0; j < insts.numClasses(); j++) {
                if (j <= i)
                    continue;
                int[] pair = new int[2];
                pair[0] = i;
                pair[1] = j;
                pairs.addElement(pair);
            }
        }
        numClassifiers = pairs.size();
        m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
        m_ClassFilters = new Filter[numClassifiers];
        m_SumOfWeights = new double[numClassifiers];

        // generate the classifiers, one per class pair
        for (int i = 0; i < numClassifiers; i++) {
            // keep only instances of the current pair (invert selection of
            // RemoveWithValues); modify-header renumbers the class values
            RemoveWithValues classFilter = new RemoveWithValues();
            classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
            classFilter.setModifyHeader(true);
            classFilter.setInvertSelection(true);
            classFilter.setNominalIndicesArr((int[]) pairs.elementAt(i));
            Instances tempInstances = new Instances(insts, 0);
            tempInstances.setClassIndex(-1);
            classFilter.setInputFormat(tempInstances);
            newInsts = Filter.useFilter(insts, classFilter);
            if (newInsts.numInstances() > 0) {
                newInsts.setClassIndex(insts.classIndex());
                m_Classifiers[i].buildClassifier(newInsts);
                m_ClassFilters[i] = classFilter;
                m_SumOfWeights[i] = newInsts.sumOfWeights();
            } else {
                // no training data for this pair: mark the slot unused
                m_Classifiers[i] = null;
                m_ClassFilters[i] = null;
            }
        }

        // construct a two-class header version of the dataset: the original
        // class attribute is replaced by a nominal {class0, class1} attribute
        m_TwoClassDataset = new Instances(insts, 0);
        int classIndex = m_TwoClassDataset.classIndex();
        m_TwoClassDataset.setClassIndex(-1);
        m_TwoClassDataset.deleteAttributeAt(classIndex);
        FastVector classLabels = new FastVector();
        classLabels.addElement("class0");
        classLabels.addElement("class1");
        m_TwoClassDataset.insertAttributeAt(new Attribute("class", classLabels), classIndex);
        m_TwoClassDataset.setClassIndex(classIndex);
    } else {
        // use error correcting code style methods
        Code code = null;
        switch (m_Method) {
        case METHOD_ERROR_EXHAUSTIVE:
            code = new ExhaustiveCode(numClassifiers);
            break;
        case METHOD_ERROR_RANDOM:
            code = new RandomCode(numClassifiers, (int) (numClassifiers * m_RandomWidthFactor), insts);
            break;
        case METHOD_1_AGAINST_ALL:
            code = new StandardCode(numClassifiers);
            break;
        default:
            throw new Exception("Unrecognized correction code type");
        }
        numClassifiers = code.size();
        m_Classifiers = Classifier.makeCopies(m_Classifier, numClassifiers);
        m_ClassFilters = new MakeIndicator[numClassifiers];
        for (int i = 0; i < m_Classifiers.length; i++) {
            // each classifier learns the binary indicator problem defined
            // by column i of the code matrix
            m_ClassFilters[i] = new MakeIndicator();
            MakeIndicator classFilter = (MakeIndicator) m_ClassFilters[i];
            classFilter.setAttributeIndex("" + (insts.classIndex() + 1));
            classFilter.setValueIndices(code.getIndices(i));
            classFilter.setNumeric(false);
            classFilter.setInputFormat(insts);
            newInsts = Filter.useFilter(insts, m_ClassFilters[i]);
            m_Classifiers[i].buildClassifier(newInsts);
        }
    }
    m_ClassAttribute = insts.classAttribute();
}
From source file:algoritmogeneticocluster.NewClass.java
public static void main(String[] args) throws Exception { BufferedReader datafile = readDataFile("tabela10.arff"); Instances data = new Instances(datafile); data.setClassIndex(data.numAttributes() - 1); // Do 10-split cross validation Instances[][] split = crossValidationSplit(data, 10); // Separate split into training and testing arrays Instances[] trainingSplits = split[0]; Instances[] testingSplits = split[1]; // Use a set of classifiers Classifier[] models = { new SMO(), new J48(), // a decision tree new PART(), new DecisionTable(), //decision table majority classifier new DecisionStump() //one-level decision tree };//from ww w .jav a2 s . c om // Run for each model for (int j = 0; j < models.length; j++) { // Collect every group of predictions for current model in a FastVector FastVector predictions = new FastVector(); // For each training-testing split pair, train and test the classifier for (int i = 0; i < trainingSplits.length; i++) { Evaluation validation = classify(models[j], trainingSplits[i], testingSplits[i]); predictions.appendElements(validation.predictions()); // Uncomment to see the summary for each training-testing pair. //System.out.println(models[j].toString()); } // Calculate overall accuracy of current classifier on all splits double accuracy = calculateAccuracy(predictions); // Print current classifier's name and accuracy in a complicated, // but nice-looking way. System.out.println("Accuracy of " + models[j].getClass().getSimpleName() + ": " + String.format("%.2f%%", accuracy) + "\n---------------------------------"); } }