List of usage examples for weka.core.Instance.stringValue
public String stringValue(int attIndex);
public String stringValue(Attribute att);
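Before the examples from real projects, here is a minimal, self-contained sketch of the basic call pattern. stringValue returns the value of a nominal, string, date, or relational attribute as a String, so it is the usual way to read a text attribute out of a loaded dataset. The ARFF path and the attribute index 0 are hypothetical placeholders, not part of any example below.

import weka.core.Instance;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class StringValueDemo {
    public static void main(String[] args) throws Exception {
        // Load a dataset whose first attribute is a string attribute
        // ("data.arff" is a placeholder path).
        Instances data = DataSource.read("data.arff");
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            // stringValue is only valid for nominal, string, date and
            // relational attributes; calling it on a numeric attribute
            // throws an IllegalArgumentException.
            System.out.println(inst.stringValue(0));
        }
    }
}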
From source file:etc.aloe.filters.WordFeaturesExtractor.java
License:Open Source License
@Override
protected Instance process(Instance instance) throws Exception {
    if (selectedAttributeIndex < 0) {
        throw new IllegalStateException("String attribute not set");
    }

    int numOldValues = instance.numAttributes();
    int numNewFeatures = unigrams.size() + bigrams.size();
    double[] newValues = new double[numOldValues + numNewFeatures];

    // Copy all attributes from input to output
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (getInputFormat().attribute(i).type() != Attribute.STRING) {
            // Add simple nominal and numeric attributes directly
            if (instance.value(i) != 0.0) {
                newValues[i] = instance.value(i);
            }
        } else {
            if (instance.isMissing(i)) {
                newValues[i] = Utils.missingValue();
            } else {
                // If this is a string attribute, we have to first add
                // this value to the range of possible values, then add
                // its new internal index.
                if (outputFormatPeek().attribute(i).numValues() == 0) {
                    // Note that the first string value in a
                    // SparseInstance doesn't get printed.
                    outputFormatPeek().attribute(i).addStringValue("Hack to defeat SparseInstance bug");
                }
                int newIndex = outputFormatPeek().attribute(i).addStringValue(instance.stringValue(i));
                newValues[i] = newIndex;
            }
        }
    }

    String stringValue = instance.stringValue(selectedAttributeIndex);
    if (instance.isMissing(selectedAttributeIndex) == false) {
        List<String> words = tokenizeDocument(instance);
        Set<String> wordSet = new HashSet<String>(words);

        for (int i = 0; i < unigrams.size(); i++) {
            String unigram = unigrams.get(i);
            int count = 0;
            if (wordSet.contains(unigram)) {
                // Count the times the word is in the document
                for (int w = 0; w < words.size(); w++) {
                    if (words.get(w).equals(unigram)) {
                        count += 1;
                    }
                }
            }
            int featureIndex = numOldValues + i;
            newValues[featureIndex] = count;
        }

        for (int i = 0; i < bigrams.size(); i++) {
            Bigram bigram = bigrams.get(i);
            int count = bigram.getTimesInDocument(words);
            int featureIndex = numOldValues + unigrams.size() + i;
            newValues[featureIndex] = count;
        }
    }

    Instance result = new SparseInstance(instance.weight(), newValues);
    return result;
}
From source file:facebookpostpuller.RemoveEntriesGUI.java
private void btnDeletePostsActionPerformed(java.awt.event.ActionEvent evt) {
    String name = txtFacebookName.getText();
    btnDeletePosts.setVisible(false);
    btnSaveArff.setVisible(false);
    btnLoadArff.setVisible(false);

    int ctr = 0;
    // Iterate backwards so deletions don't shift the indices still to be visited
    for (int i = data.numInstances() - 1; i >= 0; i--) {
        Instance inst = data.instance(i);
        if (inst.stringValue(0).equals(name)) {
            System.out.println("Deleted!");
            data.delete(i);
            ctr++;
        }
    }

    btnDeletePosts.setVisible(true);
    btnSaveArff.setVisible(true);
    btnLoadArff.setVisible(true);
    txtOutput.append(name + ": " + ctr + " posts deleted!\n");
}
From source file:filters.MauiFilter.java
License:Open Source License
private void selectCandidates() throws Exception {
    if (debugMode) {
        System.err.println("--- Computing candidates...");
    }

    allCandidates = new HashMap<Instance, HashMap<String, Candidate>>();

    // Convert pending input instances into data for classifier
    int totalDocuments = getInputFormat().numInstances();
    for (int i = 0; i < totalDocuments; i++) {
        Instance current = getInputFormat().instance(i);

        String fileName = current.stringValue(fileNameAtt);
        int j = i + 1;
        if (debugMode) {
            System.err.println("---- Processing document " + fileName + ", " + j + " out of "
                    + totalDocuments + "...");
        }

        // Get the phrases for the document
        String documentText = current.stringValue(documentAtt);
        HashMap<String, Candidate> candidateList = getCandidates(documentText);
        if (debugMode) {
            System.err.println("---- " + candidateList.size() + " candidates");
        }
        allCandidates.put(current, candidateList);
    }
}
From source file:filters.MauiFilter.java
License:Open Source License
/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {
    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == documentAtt) {
            atts.addElement(new Attribute("Term_frequency")); // 2
            atts.addElement(new Attribute("IDF"));
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            atts.addElement(new Attribute("Last_occurrence"));
            atts.addElement(new Attribute("Spread"));
            atts.addElement(new Attribute("Domain_keyphraseness"));
            atts.addElement(new Attribute("Length"));
            atts.addElement(new Attribute("Generality"));
            atts.addElement(new Attribute("Node_degree"));
            atts.addElement(new Attribute("Semantic_relatedness"));
            atts.addElement(new Attribute("Wikipedia_keyphraseness"));
            atts.addElement(new Attribute("Inverse_Wikip_frequency"));
            atts.addElement(new Attribute("Total_Wikip_keyphraseness")); // 13
        } else if (i == keyphrasesAtt) {
            if (nominalClassValue) {
                FastVector vals = new FastVector(2);
                vals.addElement("False");
                vals.addElement("True");
                atts.addElement(new Attribute("Keyphrase?", vals));
            } else {
                atts.addElement(new Attribute("Keyphrase?"));
            }
        }
    }

    classifierData = new Instances("ClassifierData", atts, 0);
    classifierData.setClassIndex(numFeatures);

    if (debugMode) {
        System.err.println("--- Converting instances for classifier");
    }
    int totalDocuments = getInputFormat().numInstances();

    // Convert pending input instances into data for classifier
    for (int i = 0; i < totalDocuments; i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(keyphrasesAtt);
        HashMap<String, Counter> hashKeyphrases = getGivenKeyphrases(keyphrases);

        // Get the phrases for the document
        HashMap<String, Candidate> candidateList = allCandidates.get(current);

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        int countPos = 0;
        int countNeg = 0;
        if (debugMode) {
            System.err.println("--- Computing features for document " + i + " out of "
                    + totalDocuments + "...");
        }
        for (Candidate candidate : candidateList.values()) {
            // Ignore all candidates that appear less than a threshold
            if (candidate.getFrequency() < minOccurFrequency) {
                continue;
            }
            // Compute feature values
            double[] vals = computeFeatureValues(candidate, true, hashKeyphrases, candidateList);
            if (vals[vals.length - 1] == 0) {
                countNeg++;
            } else {
                countPos++;
            }
            Instance inst = new Instance(current.weight(), vals);
            // System.out.println(candidate + "\t" + inst);
            classifierData.add(inst);
        }
        if (debugMode) {
            System.err.println(countPos + " positive; " + countNeg + " negative instances");
        }
    }

    if (debugMode) {
        System.err.println("--- Building classifier");
    }
    if (classifier == null) {
        // Build classifier
        if (nominalClassValue) {
            // FilteredClassifier fclass = new FilteredClassifier();
            // fclass.setClassifier(new NaiveBayesSimple());
            // fclass.setFilter(new Discretize());
            // classifier = fclass;
            classifier = new Bagging();
            // try also
            // classifier.setOptions(Utils.splitOptions("-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));
        } else {
            classifier = new Bagging();
            // try also
            // classifier.setOptions(Utils.splitOptions("-P 10 -S 1 -I 10 -W weka.classifiers.trees.J48 -- -U -M 2"));
            String optionsString = "-P 100 -S 1 -I 10 -W weka.classifiers.trees.M5P -- -U -M 7.0";
            String[] options = Utils.splitOptions(optionsString);
            classifier.setOptions(options);
        }
    }

    // Dump the training data for inspection
    FileOutputStream out = new FileOutputStream(new File("19docs.arff"));
    PrintWriter printer = new PrintWriter(out);
    printer.write(classifierData.toString());
    printer.close();
    out.close();

    classifier.buildClassifier(classifierData);
    if (debugMode) {
        System.err.println(classifier);
    }

    // Save space
    classifierData = new Instances(classifierData, 0);
}
From source file:filters.MauiFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();

    String fileName = instance.stringValue(fileNameAtt);
    if (debugMode) {
        System.err.println("-- Converting instance for document " + fileName);
    }

    // Get the key phrases for the document
    HashMap<String, Counter> hashKeyphrases = null;
    if (!instance.isMissing(keyphrasesAtt)) {
        String keyphrases = instance.stringValue(keyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases);
    }

    // Get the document text
    String documentText = instance.stringValue(documentAtt);

    // Compute the candidate topics
    HashMap<String, Candidate> candidateList;
    if (allCandidates != null && allCandidates.containsKey(instance)) {
        candidateList = allCandidates.get(instance);
    } else {
        candidateList = getCandidates(documentText);
    }
    if (debugMode) {
        System.err.println(candidateList.size() + " candidates ");
    }

    // Set indices for key attributes
    int tfidfAttIndex = documentAtt + 2;
    int distAttIndex = documentAtt + 3;
    int probsAttIndex = documentAtt + numFeatures;

    int countPos = 0;
    int countNeg = 0;

    // Go through the phrases and convert them into instances
    for (Candidate candidate : candidateList.values()) {
        if (candidate.getFrequency() < minOccurFrequency) {
            continue;
        }

        String name = candidate.getName();
        String orig = candidate.getBestFullForm();
        if (!vocabularyName.equals("none")) {
            orig = candidate.getTitle();
        }

        double[] vals = computeFeatureValues(candidate, training, hashKeyphrases, candidateList);

        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(classifierData);

        // Get probability of a phrase being key phrase
        double[] probs = classifier.distributionForInstance(inst);
        double prob = probs[0];
        if (nominalClassValue) {
            prob = probs[1];
        }

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures + 2];
        int pos = 0;
        for (int i = 1; i < instance.numAttributes(); i++) {
            if (i == documentAtt) {
                // Output of values for a given phrase:

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(name);
                newInst[pos++] = index;

                // Add original version
                if (orig != null) {
                    index = outputFormatPeek().attribute(pos).addStringValue(orig);
                } else {
                    index = outputFormatPeek().attribute(pos).addStringValue(name);
                }
                newInst[pos++] = index;

                // Add features
                newInst[pos++] = inst.value(tfIndex);
                newInst[pos++] = inst.value(idfIndex);
                newInst[pos++] = inst.value(tfidfIndex);
                newInst[pos++] = inst.value(firstOccurIndex);
                newInst[pos++] = inst.value(lastOccurIndex);
                newInst[pos++] = inst.value(spreadOccurIndex);
                newInst[pos++] = inst.value(domainKeyphIndex);
                newInst[pos++] = inst.value(lengthIndex);
                newInst[pos++] = inst.value(generalityIndex);
                newInst[pos++] = inst.value(nodeDegreeIndex);
                newInst[pos++] = inst.value(semRelIndex);
                newInst[pos++] = inst.value(wikipKeyphrIndex);
                newInst[pos++] = inst.value(invWikipFreqIndex);
                newInst[pos++] = inst.value(totalWikipKeyphrIndex);

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == keyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }

        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);

        if (inst.classValue() == 0) {
            countNeg++;
        } else {
            countPos++;
        }
    }
    if (debugMode) {
        System.err.println(countPos + " positive; " + countNeg + " negative instances");
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        currentInstance.setValue(probsAttIndex + 1, rank++);
    }
    return vector;
}
From source file:imba.classifier.NBTubes.java
@Override
public double[] distributionForInstance(Instance instance) throws Exception {
    // Computes the class membership probabilities for the instance
    // passed in as a parameter
    Instances temp = null;
    Instance p;
    Filter f;
    double[] a = new double[infoClassifier.get(0).get(0).size()];
    int i, j, k, l, x, c;
    double t, prev;
    Enumeration n;
    boolean big;
    String val;
    String[] valMinMax;

    if (wasNumeric) {
        header_Instances.add(instance);

        f = new Normalize();
        try {
            f.setInputFormat(header_Instances);
            for (Instance i1 : header_Instances) {
                f.input(i1);
            }
            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();
        while ((p = f.output()) != null) {
            temp.add(p);
        }
    }

    f = new NumericToNominal();
    if (wasNumeric) {
        try {
            f.setInputFormat(temp);
            for (Instance i1 : temp) {
                f.input(i1);
            }
            f.batchFinished();
        } catch (Exception ex) {
            Logger.getLogger(NBTubes.class.getName()).log(Level.SEVERE, null, ex);
        }

        temp = f.getOutputFormat();
        p = null;
        while ((p = f.output()) != null) {
            temp.add(p);
        }

        instance = temp.lastInstance();
        header_Instances.remove(header_Instances.size() - 1);
    } else {
        f.setInputFormat(header_Instances);
        f.input(instance);
        f.batchFinished();
        instance = f.output();
    }

    // Compute the instance's distribution over the classes
    i = 0;
    while (i < a.length) {
        a[i] = (double) sumClass[i] / dataSize;
        j = 0;
        k = 0;
        while (j < infoClassifier.size()) {
            if (j == classIdx) {
                k++;
            }
            if (wasNumeric) {
                if (filter.equals("Discretize")) {
                    l = 0;
                    big = false;
                    while (l < dataset.attribute(k).numValues() && big == false) {
                        // Parse the discretized interval label
                        val = String.valueOf(dataset.attribute(k).value(l));
                        val = val.replaceAll("'", "");
                        val = val.replaceAll("\\(", "");
                        val = val.replaceAll("\\)", "");
                        val = val.replaceAll("]", "");
                        valMinMax = val.split("-");

                        // Match the value against the interval bounds
                        if (valMinMax.length == 3) {
                            if (valMinMax[1].equals("inf")) {
                                valMinMax[1] = "0.0";
                            }
                            if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[1])
                                    && Double.valueOf(instance.stringValue(k)) <= Double.valueOf(valMinMax[2])) {
                                big = true;
                            }
                        } else {
                            if (valMinMax.length == 2) {
                                if (valMinMax[1].equals("inf")) {
                                    valMinMax[1] = "1.0";
                                }
                                if (Double.valueOf(instance.stringValue(k)) > Double.valueOf(valMinMax[0])
                                        && Double.valueOf(instance.stringValue(k)) <= Double.valueOf(valMinMax[1])) {
                                    big = true;
                                }
                            } else {
                                l = dataset.attribute(k).indexOfValue(instance.stringValue(k));
                                big = true;
                            }
                        }
                        l++;
                    }
                    x = l - 1;
                } else {
                    big = false;
                    l = 0;
                    n = dataset.attribute(k).enumerateValues();
                    t = 0;
                    prev = 0;
                    while (l < dataset.attribute(k).numValues() && big == false) {
                        t = Double.valueOf(n.nextElement().toString());
                        if (Double.valueOf(instance.stringValue(k)) <= t) {
                            big = true;
                        } else {
                            prev = t;
                        }
                        l++;
                    }
                    if (big == true && t != Double.valueOf(instance.stringValue(k))) {
                        System.out.println(prev + " " + Double.valueOf(instance.stringValue(k)) + " " + t);
                    }
                    if (classIdx < 2) {
                        c = 2;
                    } else {
                        c = 1;
                    }
                    if (big == true && l > c) {
                        // Pick whichever neighbouring value is closer
                        if ((Double.valueOf(instance.stringValue(k)) - prev)
                                <= (t - Double.valueOf(instance.stringValue(k)))) {
                            x = l - 2;
                        } else {
                            x = l - 1;
                        }
                    } else {
                        x = l - 1;
                    }
                }
            } else {
                x = dataset.attribute(k).indexOfValue(instance.stringValue(k));
            }
            a[i] *= infoClassifier.get(j).get(x).get(i);
            k++;
            j++;
        }
        i++;
    }
    return a;
}
From source file:jjj.asap.sas.parser.job.ImportParserData.java
License:Open Source License
private void process(final String parent, int essaySet, Map<Double, List<String>> tags,
        Map<Double, List<String>> parseTrees, Map<Double, List<String>> depends) {

    // Check if output exists
    boolean any = false;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-extra-stats.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-pos-tags.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-parse-tree.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends0.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends1.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends2.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends3.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends4.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends5.arff"))
        any = true;
    if (!IOUtils.exists("work/datasets/" + parent + "/" + essaySet + "-depends6.arff"))
        any = true;
    if (!any) {
        Job.log("NOTE", "work/datasets/" + parent + "/" + essaySet
                + "-*.arff returns all required datasets - nothing to do");
        return;
    }

    // Load an existing dataset to use as a template.
    Instances dataset = Dataset.load("work/datasets/" + parent + "/" + essaySet + "-spell-checked.arff");

    // Create the output datasets here. Except for the extra statistics,
    // the format is the same as 'dataset'.
    Instances tagsData = new Instances(dataset, 0);
    tagsData.setRelationName(essaySet + "-pos-tags.arff");
    Instances treeData = new Instances(dataset, 0);
    treeData.setRelationName(essaySet + "-parse-tree.arff");
    Instances dependsData[] = new Instances[7];
    for (int j = 0; j < 7; j++) {
        dependsData[j] = new Instances(dataset, 0);
        dependsData[j].setRelationName(essaySet + "-depends" + j + ".arff");
    }

    // Extra stats
    DatasetBuilder builder = new DatasetBuilder();
    builder.addVariable("id");
    if (Contest.isMultiChoice(essaySet)) {
        builder.addNominalVariable("color", Contest.COLORS);
    }
    builder.addVariable("x_sent");
    builder.addVariable("x_para");
    builder.addVariable("x_length");
    builder.addVariable("x_words");
    builder.addVariable("x_unique_words");
    builder.addNominalVariable("score", Contest.getRubrics(essaySet));
    Instances extraStats = builder.getDataset(essaySet + "-extra-stats.arff");

    // Now add rows for each instance
    for (int i = 0; i < dataset.numInstances(); i++) {

        // Common variables
        Instance ob = dataset.instance(i);
        double id = ob.value(0);
        String y = ob.isMissing(dataset.numAttributes() - 1) ? null
                : ob.stringValue(dataset.numAttributes() - 1);
        String color = Contest.isMultiChoice(essaySet) ? ob.stringValue(dataset.attribute("color")) : null;
        String str = ob.stringValue(dataset.attribute("text"));

        //
        // Extra stats
        //
        int nSent = tags.containsKey(id) ? tags.get(id).size() : 0;
        int nPara = 0;
        for (int a = 0; a < str.length(); a++) {
            if (str.charAt(a) == '^')
                nPara++;
        }
        int nLength = str.length();
        int nWords = 0;
        int nUniqueWords = 0;
        String[] words = str.toLowerCase().split(" ");
        nWords = words.length;
        Set<String> u = new HashSet<String>();
        for (String w : words) {
            u.add(w);
        }
        nUniqueWords = u.size();

        extraStats.add(new DenseInstance(extraStats.numAttributes()));
        Instance extra = extraStats.lastInstance();
        extra.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            extra.setValue(1, color);
        }
        extra.setValue(extraStats.attribute("x_sent"), nSent);
        extra.setValue(extraStats.attribute("x_para"), nPara);
        extra.setValue(extraStats.attribute("x_length"), nLength);
        extra.setValue(extraStats.attribute("x_words"), nWords);
        extra.setValue(extraStats.attribute("x_unique_words"), nUniqueWords);
        if (y == null)
            extra.setValue(extraStats.numAttributes() - 1, Utils.missingValue());
        else
            extra.setValue(extraStats.numAttributes() - 1, y);

        //
        // POS tags
        //
        String tagsText = "";
        List<String> tagsList = tags.get(id);
        if (tagsList == null || tagsList.isEmpty()) {
            Job.log("WARNING", "no tags for " + id);
            tagsText = "x";
        } else {
            for (String tagsItem : tagsList) {
                tagsText += tagsItem;
            }
        }

        tagsData.add(new DenseInstance(ob.numAttributes()));
        Instance tagsOb = tagsData.lastInstance();
        tagsOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            tagsOb.setValue(1, color);
            tagsOb.setValue(2, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(3, Utils.missingValue());
            } else {
                tagsOb.setValue(3, y);
            }
        } else {
            tagsOb.setValue(1, tagsText.trim());
            if (y == null) {
                tagsOb.setValue(2, Utils.missingValue());
            } else {
                tagsOb.setValue(2, y);
            }
        }

        //
        // Parse tree
        //
        String treeText = "";
        List<String> treeList = parseTrees.get(id);
        if (treeList == null || treeList.isEmpty()) {
            Job.log("WARNING", "no parse tree for " + id);
            treeText = "x";
        } else {
            for (String treeItem : treeList) {
                treeText += treeItem;
            }
        }

        treeData.add(new DenseInstance(ob.numAttributes()));
        Instance treeOb = treeData.lastInstance();
        treeOb.setValue(0, id);
        if (Contest.isMultiChoice(essaySet)) {
            treeOb.setValue(1, color);
            treeOb.setValue(2, treeText.trim());
            if (y == null) {
                treeOb.setValue(3, Utils.missingValue());
            } else {
                treeOb.setValue(3, y);
            }
        } else {
            treeOb.setValue(1, treeText.trim());
            if (y == null) {
                treeOb.setValue(2, Utils.missingValue());
            } else {
                treeOb.setValue(2, y);
            }
        }

        //
        // Depends data
        //
        for (int j = 0; j < 7; j++) {
            String text = "";
            List<String> list = depends.get(id);
            if (list == null || list.isEmpty()) {
                Job.log("WARNING", "no depends for " + id);
                text = "x";
            } else {
                for (String item : list) {
                    String[] term = StringUtils.safeSplit(item, "/", 3);
                    switch (j) {
                    case 0:
                        text += item;
                        break;
                    case 1:
                        text += term[1] + "/" + term[2];
                        break;
                    case 2:
                        text += term[0] + "/" + term[2];
                        break;
                    case 3:
                        text += term[0] + "/" + term[1];
                        break;
                    case 4:
                        text += term[0];
                        break;
                    case 5:
                        text += term[1];
                        break;
                    case 6:
                        text += term[2];
                        break;
                    }
                    text += " ";
                }
            }

            dependsData[j].add(new DenseInstance(ob.numAttributes()));
            Instance dependsOb = dependsData[j].lastInstance();
            dependsOb.setValue(0, id);
            if (Contest.isMultiChoice(essaySet)) {
                dependsOb.setValue(1, color);
                dependsOb.setValue(2, text.trim());
                if (y == null) {
                    dependsOb.setValue(3, Utils.missingValue());
                } else {
                    dependsOb.setValue(3, y);
                }
            } else {
                dependsOb.setValue(1, text.trim());
                if (y == null) {
                    dependsOb.setValue(2, Utils.missingValue());
                } else {
                    dependsOb.setValue(2, y);
                }
            }
        } // j
    } // dataset

    // Now save the new datasets
    Dataset.save("work/datasets/" + parent + "/" + tagsData.relationName(), tagsData);
    Dataset.save("work/datasets/" + parent + "/" + treeData.relationName(), treeData);
    for (int j = 0; j < 7; j++) {
        Dataset.save("work/datasets/" + parent + "/" + dependsData[j].relationName(), dependsData[j]);
    }
    Dataset.save("work/datasets/" + parent + "/" + extraStats.relationName(), extraStats);
}
From source file:kea.KEAFilter.java
License:Open Source License
/**
 * Builds the classifier.
 */
private void buildClassifier() throws Exception {
    // Generate input format for classifier
    FastVector atts = new FastVector();
    for (int i = 0; i < getInputFormat().numAttributes(); i++) {
        if (i == m_DocumentAtt) {
            atts.addElement(new Attribute("TFxIDF"));
            atts.addElement(new Attribute("First_occurrence"));
            if (m_KFused) {
                atts.addElement(new Attribute("Keyphrase_frequency"));
            }
        } else if (i == m_KeyphrasesAtt) {
            FastVector vals = new FastVector(2);
            vals.addElement("False");
            vals.addElement("True");
            atts.addElement(new Attribute("Keyphrase?", vals));
        }
    }
    m_ClassifierData = new Instances("ClassifierData", atts, 0);
    m_ClassifierData.setClassIndex(m_NumFeatures);

    if (m_Debug) {
        System.err.println("--- Converting instances for classifier");
    }

    // Convert pending input instances into data for classifier
    for (int i = 0; i < getInputFormat().numInstances(); i++) {
        Instance current = getInputFormat().instance(i);

        // Get the key phrases for the document
        String keyphrases = current.stringValue(m_KeyphrasesAtt);
        HashMap hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        HashMap hashKeysEval = getGivenKeyphrases(keyphrases, true);

        // Get the phrases for the document
        HashMap hash = new HashMap();
        int length = getPhrases(hash, current.stringValue(m_DocumentAtt));

        // Compute the feature values for each phrase and
        // add the instance to the data for the classifier
        Iterator it = hash.keySet().iterator();
        while (it.hasNext()) {
            String phrase = (String) it.next();
            FastVector phraseInfo = (FastVector) hash.get(phrase);
            double[] vals = featVals(phrase, phraseInfo, true, hashKeysEval, hashKeyphrases, length);
            Instance inst = new Instance(current.weight(), vals);
            m_ClassifierData.add(inst);
        }
    }

    if (m_Debug) {
        System.err.println("--- Building classifier");
    }

    // Build classifier
    FilteredClassifier fclass = new FilteredClassifier();
    fclass.setClassifier(new NaiveBayesSimple());
    fclass.setFilter(new Discretize());
    m_Classifier = fclass;
    m_Classifier.buildClassifier(m_ClassifierData);

    if (m_Debug) {
        System.err.println(m_Classifier);
    }

    // Save space
    m_ClassifierData = new Instances(m_ClassifierData, 0);
}
From source file:kea.KEAFilter.java
License:Open Source License
/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {
                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {
                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}
From source file:kea.KEAPhraseFilter.java
License:Open Source License
/**
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {
    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);

            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}