List of usage examples for the add method of weka.core.Instances
@Override public boolean add(Instance instance)
From source file:Prediccion.PrecidePasoNodo.java
License:Open Source License
Instances cargarDatos(int hora) throws ParseException { //Declaramos los atributos de las instancias Attribute a0 = new Attribute("Intervalo", "yyyy-MM-dd HH:mm:ss"); Attribute a1 = new Attribute("Total"); ArrayList<Attribute> c = new ArrayList<>(); c.add(a0);//from ww w. j a v a 2 s . co m c.add(a1); //Creamos el conjunto de instancias Instances instances = new Instances(nodo, c, 1000); //Instanciamos conexion con FT cFT = new conectarFusionTables(); Sqlresponse r = cFT.select(TABLAID, "Intervalo, Total", "idNodo = " + nodo + " and Intervalo ENDS WITH '00:00:00'", "ORDER BY \'Intervalo\' DESC LIMIT 10000"); for (List<Object> a : r.getRows()) { Instance i = new DenseInstance(2); String s0 = (String) a.get(0); String s1 = (String) a.get(1); System.err.println(s0 + " ->" + s1); i.setValue(instances.attribute(0), instances.attribute(0).parseDate(s0)); i.setValue(instances.attribute(1), Integer.parseInt(s1)); instances.add(i); } instances.sort(0); return instances; }
From source file:preprocess.TextDirectoryLoader.java
License:Open Source License
/**
 * Return the full data set. If the structure hasn't yet been determined
 * by a call to getStructure then method should do so before processing
 * the rest of the data set.
 *
 * Each value of the class attribute is used as the name of a sub-directory
 * of the source directory; every file in such a sub-directory becomes one
 * instance holding the file's text (attribute 0), optionally its relative
 * path (attribute 1, when m_OutputFilename is set) and the class index.
 * Files that cannot be converted are reported on stderr and skipped.
 * Sub-directories that cannot be listed are skipped.
 *
 * @return the structure of the data set filled with one instance per
 *         successfully read file
 * @throws IOException if no directory/source has been specified
 */
public Instances getDataSet() throws IOException {
    if (getDirectory() == null)
        throw new IOException("No directory/source has been specified");

    String directoryPath = getDirectory().getAbsolutePath();
    FastVector classes = new FastVector();
    Enumeration enm = getStructure().classAttribute().enumerateValues();
    while (enm.hasMoreElements())
        classes.addElement(enm.nextElement());

    Instances data = getStructure();
    int fileCount = 0;
    for (int k = 0; k < classes.size(); k++) {
        String subdirPath = (String) classes.elementAt(k);
        File subdir = new File(directoryPath + File.separator + subdirPath);
        String[] files = subdir.list();
        if (files == null) {
            // File.list() yields null when the path is not a directory or
            // cannot be read; there is nothing to load for this class.
            continue;
        }
        for (int j = 0; j < files.length; j++) {
            try {
                fileCount++;
                if (getDebug())
                    System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]);

                double[] newInst = m_OutputFilename ? new double[3] : new double[2];
                File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]);

                // Read the file line by line; every line is terminated with "\n".
                StringBuffer txtStr = new StringBuffer();
                BufferedReader br = new BufferedReader(new FileReader(txt));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        txtStr.append(line).append("\n");
                    }
                } finally {
                    // Always release the file handle, also when reading fails.
                    br.close();
                }

                newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
                if (m_OutputFilename)
                    newInst[1] = (double) data.attribute(1)
                            .addStringValue(subdirPath + File.separator + files[j]);
                newInst[data.classIndex()] = (double) k;
                data.add(new Instance(1.0, newInst));
            } catch (Exception e) {
                System.err.println("failed to convert file: " + directoryPath + File.separator
                        + subdirPath + File.separator + files[j]);
            }
        }
    }
    return data;
}
From source file:preprocess.TextDirectoryLoaderEX.java
License:Open Source License
/** * Return the full data set. If the structure hasn't yet been determined * by a call to getStructure then method should do so before processing * the rest of the data set.//from www.j av a 2 s . c o m * * @return the structure of the data set as an empty set of Instances * @throws IOException if there is no source or parsing fails */ public Instances getDataSet() throws IOException { if (getDirectory() == null) throw new IOException("No directory/source has been specified"); String directoryPath = getDirectory().getAbsolutePath(); FastVector classes = new FastVector(); Enumeration enm = getStructure().classAttribute().enumerateValues(); while (enm.hasMoreElements()) classes.addElement(enm.nextElement()); Instances data = getStructure(); int fileCount = 0; for (int k = 0; k < classes.size(); k++) { String subdirPath = (String) classes.elementAt(k); File subdir = new File(directoryPath + File.separator + subdirPath); String[] files = subdir.list(); for (int j = 0; j < files.length; j++) { try { fileCount++; if (getDebug()) System.err.println("processing " + fileCount + " : " + subdirPath + " : " + files[j]); double[] newInst = null; if (m_OutputFilename) newInst = new double[3]; else newInst = new double[2]; File txt = new File(directoryPath + File.separator + subdirPath + File.separator + files[j]); BufferedInputStream is; is = new BufferedInputStream(new FileInputStream(txt)); StringBuffer txtStr = new StringBuffer(); /* int c; while ((c = is.read()) != -1) { txtStr.append((char) c); }*/ FileReader fr = new FileReader(txt); BufferedReader br = new BufferedReader(fr); String line; while ((line = br.readLine()) != null) { txtStr.append(line + "\n"); } br.close(); fr.close(); newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString()); if (m_OutputFilename) newInst[0] = Integer.valueOf(files[j].toString()); newInst[data.classIndex()] = (double) k; data.add(new Instance(1.0, newInst)); is.close(); } catch (Exception e) { System.err.println("failed to 
convert file: " + directoryPath + File.separator + subdirPath + File.separator + files[j]); } } } return data; }
From source file:probcog.J48Reader.java
License:Open Source License
public static Instances readDB(String dbname) throws IOException, ClassNotFoundException, DDException, FileNotFoundException, Exception { Database db = Database.fromFile(new FileInputStream(dbname)); probcog.srldb.datadict.DataDictionary dd = db.getDataDictionary(); //the vector of attributes FastVector fvAttribs = new FastVector(); HashMap<String, Attribute> mapAttrs = new HashMap<String, Attribute>(); for (DDAttribute attribute : dd.getObject("object").getAttributes().values()) { if (attribute.isDiscarded() && !attribute.getName().equals("objectT")) { continue; }// ww w . j av a2 s . c om FastVector attValues = new FastVector(); Domain dom = attribute.getDomain(); for (String s : dom.getValues()) attValues.addElement(s); Attribute attr = new Attribute(attribute.getName(), attValues); fvAttribs.addElement(attr); mapAttrs.put(attribute.getName(), attr); } Instances instances = new Instances("name", fvAttribs, 10000); instances.setClass(mapAttrs.get("objectT")); //for each object add an instance for (Object o : db.getObjects()) { if (o.hasAttribute("objectT")) { Instance instance = new Instance(fvAttribs.size()); for (Entry<String, String> e : o.getAttributes().entrySet()) { if (!dd.getAttribute(e.getKey()).isDiscarded()) { instance.setValue(mapAttrs.get(e.getKey()), e.getValue()); } } instances.add(instance); } } return instances; }
From source file:project.MauiModelBuilder.java
License:Open Source License
/** * Builds the model from the training data *//* w w w. j av a 2s.c o m*/ public void buildModel(HashSet<String> fileNames) throws Exception { // Check whether there is actually any data if (fileNames.size() == 0) { throw new Exception("Couldn't find any data in " + inputDirectoryName); } System.err.println("-- Building the model... "); FastVector atts = new FastVector(3); atts.addElement(new Attribute("filename", (FastVector) null)); atts.addElement(new Attribute("document", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); // Build model mauiFilter = new MauiFilter(); mauiFilter.setDebug(getDebug()); mauiFilter.setMaxPhraseLength(getMaxPhraseLength()); mauiFilter.setMinPhraseLength(getMinPhraseLength()); mauiFilter.setMinNumOccur(getMinNumOccur()); mauiFilter.setStemmer(getStemmer()); mauiFilter.setDocumentLanguage(getDocumentLanguage()); mauiFilter.setVocabularyName(getVocabularyName()); mauiFilter.setVocabularyFormat(getVocabularyFormat()); mauiFilter.setStopwords(getStopwords()); if (wikipedia != null) { mauiFilter.setWikipedia(wikipedia); } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) { mauiFilter.setWikipedia(wikipedia); } else { mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory); } if (classifier != null) { mauiFilter.setClassifier(classifier); } mauiFilter.setInputFormat(data); // set features configurations mauiFilter.setBasicFeatures(useBasicFeatures); mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature); mauiFilter.setFrequencyFeatures(useFrequencyFeatures); mauiFilter.setPositionsFeatures(usePositionsFeatures); mauiFilter.setLengthFeature(useLengthFeature); mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setBasicWikipediaFeatures(useBasicWikipediaFeatures); mauiFilter.setAllWikipediaFeatures(useAllWikipediaFeatures); 
mauiFilter.setThesaurusFeatures(useNodeDegreeFeature); mauiFilter.setClassifier(classifier); mauiFilter.setContextSize(contextSize); mauiFilter.setMinKeyphraseness(minKeyphraseness); mauiFilter.setMinSenseProbability(minSenseProbability); if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) { mauiFilter.loadThesaurus(getStemmer(), getStopwords()); } System.err.println("-- Reading the input documents... "); for (String fileName : fileNames) { double[] newInst = new double[3]; newInst[0] = (double) data.attribute(0).addStringValue(fileName); ; File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt"); File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key"); try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTextFile)); } // Reading the file content StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } is.close(); // Adding the text of the document to the instance newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTextFile); e.printStackTrace(); newInst[1] = Instance.missingValue(); } try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTopicsFile)); } // Reading the content of the keyphrase file StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } // Adding the topics to the file newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTopicsFile); e.printStackTrace(); 
newInst[2] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); mauiFilter.input(data.instance(0)); data = data.stringFreeStructure(); } mauiFilter.batchFinished(); while ((mauiFilter.output()) != null) { } ; }
From source file:project.MauiTopicExtractor.java
License:Open Source License
/** * Builds the model from the files/*from www . j a v a2 s .c o m*/ */ public void extractKeyphrases(HashSet<String> fileNames) throws Exception { // Check whether there is actually any data if (fileNames.size() == 0) { throw new Exception("Couldn't find any data in " + inputDirectoryName); } mauiFilter.setVocabularyName(getVocabularyName()); mauiFilter.setVocabularyFormat(getVocabularyFormat()); mauiFilter.setDocumentLanguage(getDocumentLanguage()); mauiFilter.setStemmer(getStemmer()); mauiFilter.setStopwords(getStopwords()); if (wikipedia != null) { mauiFilter.setWikipedia(wikipedia); } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) { mauiFilter.setWikipedia(wikipedia); } else { mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory); } if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) { mauiFilter.loadThesaurus(getStemmer(), getStopwords()); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("filename", (FastVector) null)); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); System.err.println("-- Extracting keyphrases... 
"); Vector<Double> correctStatistics = new Vector<Double>(); Vector<Double> precisionStatistics = new Vector<Double>(); Vector<Double> recallStatistics = new Vector<Double>(); for (String fileName : fileNames) { double[] newInst = new double[3]; System.out.println("fileName print out: " + fileName); newInst[0] = (double) data.attribute(0).addStringValue(fileName); ; File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt"); File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key"); try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTextFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTextFile)); } // Reading the file content StringBuffer txtStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { txtStr.append((char) c); } is.close(); // Adding the text of the document to the instance newInst[1] = (double) data.attribute(1).addStringValue(txtStr.toString()); } catch (Exception e) { System.err.println("Problem with reading " + documentTextFile); e.printStackTrace(); newInst[1] = Instance.missingValue(); } try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTopicsFile)); } // Reading the content of the keyphrase file StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } // Adding the topics to the file newInst[2] = (double) data.attribute(2).addStringValue(keyStr.toString()); } catch (Exception e) { if (debugMode) { System.err.println("No existing topics for " + documentTextFile); } newInst[2] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); mauiFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (debugMode) { System.err.println("-- Processing 
document: " + fileName); } Instance[] topRankedInstances = new Instance[topicsPerDocument]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = mauiFilter.output()) != null) { int index = (int) inst.value(mauiFilter.getRankIndex()) - 1; if (index < topicsPerDocument) { topRankedInstances[index] = inst; } } if (debugMode) { System.err.println("-- Keyphrases and feature values:"); } FileOutputStream out = null; PrintWriter printer = null; if (!documentTopicsFile.exists()) { out = new FileOutputStream(documentTopicsFile); if (!documentEncoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; wikipedia = mauiFilter.getWikipedia(); HashMap<Article, Integer> topics = null; if (printGraph) { topics = new HashMap<Article, Integer>(); } int p = 0; String root = ""; for (int i = 0; i < topicsPerDocument; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } if (printer != null) { String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex()); printer.print(topic); if (printGraph) { Article article = wikipedia.getArticleByTitle(topic); if (article == null) { article = wikipedia.getMostLikelyArticle(topic, new CaseFolder()); } if (article != null) { if (root == "") { root = article.getTitle(); } topics.put(article, new Integer(p)); } else { if (debugMode) { System.err.println( "Couldn't find article for " + topic + " in " + documentTopicsFile); } } p++; } if (additionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex())); printer.print("\t"); printer.print(Utils.doubleToString( 
topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4)); } printer.println(); } if (debugMode) { System.err.println(topRankedInstances[i]); } } } if (printGraph) { String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv"); computeGraph(topics, root, graphFile); } if (numExtracted > 0) { if (debugMode) { System.err.println("-- " + numCorrect + " correct"); } double totalCorrect = mauiFilter.getTotalCorrect(); correctStatistics.addElement(new Double(numCorrect)); precisionStatistics.addElement(new Double(numCorrect / numExtracted)); recallStatistics.addElement(new Double(numCorrect / totalCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } for (int i = 0; i < topicsPerDocument; i++) { System.out.println(topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex())); } } if (correctStatistics.size() != 0) { double[] st = new double[correctStatistics.size()]; for (int i = 0; i < correctStatistics.size(); i++) { st[i] = correctStatistics.elementAt(i).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); if (correctStatistics.size() == 1) { System.err.println("\n-- Evaluation results based on 1 document:"); } else { System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:"); } System.err.println("Avg. 
number of correct keyphrases per document: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); st = new double[precisionStatistics.size()]; for (int i = 0; i < precisionStatistics.size(); i++) { st[i] = precisionStatistics.elementAt(i).doubleValue(); } double avgPrecision = Utils.mean(st); double stdDevPrecision = Math.sqrt(Utils.variance(st)); System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- " + Utils.doubleToString(stdDevPrecision * 100, 2)); st = new double[recallStatistics.size()]; for (int i = 0; i < recallStatistics.size(); i++) { st[i] = recallStatistics.elementAt(i).doubleValue(); } double avgRecall = Utils.mean(st); double stdDevRecall = Math.sqrt(Utils.variance(st)); System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- " + Utils.doubleToString(stdDevRecall * 100, 2)); double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision); System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2)); System.err.println(""); } mauiFilter.batchFinished(); }
From source file:project.MauiTopicExtractor.java
License:Open Source License
/** * Builds the model from the files/*w w w. j ava 2 s. c om*/ */ public LinkedList<String> extractKeyphrases(String abstractContents) throws Exception { mauiFilter.setVocabularyName(getVocabularyName()); mauiFilter.setVocabularyFormat(getVocabularyFormat()); mauiFilter.setDocumentLanguage(getDocumentLanguage()); mauiFilter.setStemmer(getStemmer()); mauiFilter.setStopwords(getStopwords()); if (wikipedia != null) { mauiFilter.setWikipedia(wikipedia); } else if (wikipediaServer.equals("localhost") && wikipediaDatabase.equals("database")) { mauiFilter.setWikipedia(wikipedia); } else { mauiFilter.setWikipedia(wikipediaServer, wikipediaDatabase, cacheWikipediaData, wikipediaDataDirectory); } if (!vocabularyName.equals("none") && !vocabularyName.equals("wikipedia")) { mauiFilter.loadThesaurus(getStemmer(), getStopwords()); } FastVector atts = new FastVector(3); atts.addElement(new Attribute("filename", (FastVector) null)); atts.addElement(new Attribute("doc", (FastVector) null)); atts.addElement(new Attribute("keyphrases", (FastVector) null)); Instances data = new Instances("keyphrase_training_data", atts, 0); System.err.println("-- Extracting keyphrases... 
"); Vector<Double> correctStatistics = new Vector<Double>(); Vector<Double> precisionStatistics = new Vector<Double>(); Vector<Double> recallStatistics = new Vector<Double>(); boolean doneAlready = false; while (!doneAlready) { double[] newInst = new double[3]; String fileName = "cailen"; newInst[0] = (double) data.attribute(0).addStringValue(fileName); ; File documentTextFile = new File(inputDirectoryName + "/" + fileName + ".txt"); File documentTopicsFile = new File(inputDirectoryName + "/" + fileName + ".key"); try { // Adding the text of the document to the instance newInst[1] = (double) data.attribute(1).addStringValue(abstractContents); } catch (Exception e) { System.err.println("Problem with reading " + documentTextFile); e.printStackTrace(); newInst[1] = Instance.missingValue(); } try { InputStreamReader is; if (!documentEncoding.equals("default")) { is = new InputStreamReader(new FileInputStream(documentTopicsFile), documentEncoding); } else { is = new InputStreamReader(new FileInputStream(documentTopicsFile)); } // Reading the content of the keyphrase file StringBuffer keyStr = new StringBuffer(); int c; while ((c = is.read()) != -1) { keyStr.append((char) c); } // Adding the topics to the file newInst[2] = (double) data.attribute(2).addStringValue(abstractContents); } catch (Exception e) { if (debugMode) { System.err.println("No existing topics for " + documentTextFile); } newInst[2] = Instance.missingValue(); } data.add(new Instance(1.0, newInst)); mauiFilter.input(data.instance(0)); data = data.stringFreeStructure(); if (debugMode) { System.err.println("-- Processing document: " + fileName); } Instance[] topRankedInstances = new Instance[topicsPerDocument]; Instance inst; // Iterating over all extracted keyphrases (inst) while ((inst = mauiFilter.output()) != null) { int index = (int) inst.value(mauiFilter.getRankIndex()) - 1; if (index < topicsPerDocument) { topRankedInstances[index] = inst; } } if (debugMode) { System.err.println("-- Keyphrases and 
feature values:"); } FileOutputStream out = null; PrintWriter printer = null; if (!documentTopicsFile.exists()) { out = new FileOutputStream(documentTopicsFile); if (!documentEncoding.equals("default")) { printer = new PrintWriter(new OutputStreamWriter(out, documentEncoding)); } else { printer = new PrintWriter(out); } } double numExtracted = 0, numCorrect = 0; wikipedia = mauiFilter.getWikipedia(); HashMap<Article, Integer> topics = null; if (printGraph) { topics = new HashMap<Article, Integer>(); } int p = 0; String root = ""; for (int i = 0; i < topicsPerDocument; i++) { if (topRankedInstances[i] != null) { if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) { numExtracted += 1.0; } if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) { numCorrect += 1.0; } if (printer != null) { String topic = topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex()); printer.print(topic); if (printGraph) { Article article = wikipedia.getArticleByTitle(topic); if (article == null) { article = wikipedia.getMostLikelyArticle(topic, new CaseFolder()); } if (article != null) { if (root == "") { root = article.getTitle(); } topics.put(article, new Integer(p)); } else { if (debugMode) { System.err.println( "Couldn't find article for " + topic + " in " + documentTopicsFile); } } p++; } if (additionalInfo) { printer.print("\t"); printer.print(topRankedInstances[i].stringValue(mauiFilter.getNormalizedFormIndex())); printer.print("\t"); printer.print(Utils.doubleToString( topRankedInstances[i].value(mauiFilter.getProbabilityIndex()), 4)); } printer.println(); } if (debugMode) { System.err.println(topRankedInstances[i]); } } } if (printGraph) { String graphFile = documentTopicsFile.getAbsolutePath().replace(".key", ".gv"); computeGraph(topics, root, graphFile); } if (numExtracted > 0) { if (debugMode) { System.err.println("-- " + numCorrect + " correct"); } double totalCorrect = mauiFilter.getTotalCorrect(); 
correctStatistics.addElement(new Double(numCorrect)); precisionStatistics.addElement(new Double(numCorrect / numExtracted)); recallStatistics.addElement(new Double(numCorrect / totalCorrect)); } if (printer != null) { printer.flush(); printer.close(); out.close(); } for (int i = 0; i < topicsPerDocument; i++) { topicsList.add(topRankedInstances[i].stringValue(mauiFilter.getOutputFormIndex())); } doneAlready = true; } if (correctStatistics.size() != 0) { double[] st = new double[correctStatistics.size()]; for (int i = 0; i < correctStatistics.size(); i++) { st[i] = correctStatistics.elementAt(i).doubleValue(); } double avg = Utils.mean(st); double stdDev = Math.sqrt(Utils.variance(st)); if (correctStatistics.size() == 1) { System.err.println("\n-- Evaluation results based on 1 document:"); } else { System.err.println("\n-- Evaluation results based on " + correctStatistics.size() + " documents:"); } System.err.println("Avg. number of correct keyphrases per document: " + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2)); st = new double[precisionStatistics.size()]; for (int i = 0; i < precisionStatistics.size(); i++) { st[i] = precisionStatistics.elementAt(i).doubleValue(); } double avgPrecision = Utils.mean(st); double stdDevPrecision = Math.sqrt(Utils.variance(st)); System.err.println("Precision: " + Utils.doubleToString(avgPrecision * 100, 2) + " +/- " + Utils.doubleToString(stdDevPrecision * 100, 2)); st = new double[recallStatistics.size()]; for (int i = 0; i < recallStatistics.size(); i++) { st[i] = recallStatistics.elementAt(i).doubleValue(); } double avgRecall = Utils.mean(st); double stdDevRecall = Math.sqrt(Utils.variance(st)); System.err.println("Recall: " + Utils.doubleToString(avgRecall * 100, 2) + " +/- " + Utils.doubleToString(stdDevRecall * 100, 2)); double fMeasure = 2 * avgRecall * avgPrecision / (avgRecall + avgPrecision); System.err.println("F-Measure: " + Utils.doubleToString(fMeasure * 100, 2)); System.err.println(""); } 
mauiFilter.batchFinished(); return topicsList; }
From source file:py.fpuna.lib.ExtendedInstanceQuery.java
License:Open Source License
/** * Makes a database query to convert a table into a set of instances * * @param query the query to convert to instances * @return the instances contained in the result of the query, NULL if the * SQL query doesn't return a ResultSet, e.g., DELETE/INSERT/UPDATE * @throws Exception if an error occurs// ww w . ja v a 2 s .c om */ public Instances retrieveInstances(String query) throws Exception { if (m_Debug) System.err.println("Executing query: " + query); connectToDatabase(); if (execute(query) == false) { if (m_PreparedStatement.getUpdateCount() == -1) { throw new Exception("Query didn't produce results"); } else { if (m_Debug) System.err.println(m_PreparedStatement.getUpdateCount() + " rows affected."); close(); return null; } } ResultSet rs = getResultSet(); if (m_Debug) System.err.println("Getting metadata..."); ResultSetMetaData md = rs.getMetaData(); if (m_Debug) System.err.println("Completed getting metadata..."); // Determine structure of the instances int numAttributes = md.getColumnCount(); int[] attributeTypes = new int[numAttributes]; Hashtable[] nominalIndexes = new Hashtable[numAttributes]; FastVector[] nominalStrings = new FastVector[numAttributes]; for (int i = 1; i <= numAttributes; i++) { /* switch (md.getColumnType(i)) { case Types.CHAR: case Types.VARCHAR: case Types.LONGVARCHAR: case Types.BINARY: case Types.VARBINARY: case Types.LONGVARBINARY:*/ switch (translateDBColumnType(md.getColumnTypeName(i))) { case STRING: //System.err.println("String --> nominal"); attributeTypes[i - 1] = Attribute.NOMINAL; nominalIndexes[i - 1] = new Hashtable(); nominalStrings[i - 1] = new FastVector(); break; case TEXT: //System.err.println("Text --> string"); attributeTypes[i - 1] = Attribute.STRING; nominalIndexes[i - 1] = new Hashtable(); nominalStrings[i - 1] = new FastVector(); break; case BOOL: //System.err.println("boolean --> nominal"); attributeTypes[i - 1] = Attribute.NOMINAL; nominalIndexes[i - 1] = new Hashtable(); nominalIndexes[i - 1].put("false", 
new Double(0)); nominalIndexes[i - 1].put("true", new Double(1)); nominalStrings[i - 1] = new FastVector(); nominalStrings[i - 1].addElement("false"); nominalStrings[i - 1].addElement("true"); break; case DOUBLE: //System.err.println("BigDecimal --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case BYTE: //System.err.println("byte --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case SHORT: //System.err.println("short --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case INTEGER: //System.err.println("int --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case LONG: //System.err.println("long --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case FLOAT: //System.err.println("float --> numeric"); attributeTypes[i - 1] = Attribute.NUMERIC; break; case DATE: attributeTypes[i - 1] = Attribute.DATE; break; case TIME: attributeTypes[i - 1] = Attribute.DATE; break; default: //System.err.println("Unknown column type"); attributeTypes[i - 1] = Attribute.STRING; } } // For sqlite // cache column names because the last while(rs.next()) { iteration for // the tuples below will close the md object: Vector<String> columnNames = new Vector<String>(); for (int i = 0; i < numAttributes; i++) { columnNames.add(md.getColumnLabel(i + 1)); } // Step through the tuples if (m_Debug) System.err.println("Creating instances..."); FastVector instances = new FastVector(); int rowCount = 0; while (rs.next()) { if (rowCount % 100 == 0) { if (m_Debug) { System.err.print("read " + rowCount + " instances \r"); System.err.flush(); } } double[] vals = new double[numAttributes]; for (int i = 1; i <= numAttributes; i++) { /*switch (md.getColumnType(i)) { case Types.CHAR: case Types.VARCHAR: case Types.LONGVARCHAR: case Types.BINARY: case Types.VARBINARY: case Types.LONGVARBINARY:*/ switch (translateDBColumnType(md.getColumnTypeName(i))) { case STRING: String str = rs.getString(i); if (rs.wasNull()) { vals[i - 1] = 
Instance.missingValue(); } else { Double index = (Double) nominalIndexes[i - 1].get(str); if (index == null) { index = new Double(nominalStrings[i - 1].size()); nominalIndexes[i - 1].put(str, index); nominalStrings[i - 1].addElement(str); } vals[i - 1] = index.doubleValue(); } break; case TEXT: String txt = rs.getString(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { Double index = (Double) nominalIndexes[i - 1].get(txt); if (index == null) { // Need to add one because first value in // string attribute is dummy value. index = new Double(nominalStrings[i - 1].size()) + 1; nominalIndexes[i - 1].put(txt, index); nominalStrings[i - 1].addElement(txt); } vals[i - 1] = index.doubleValue(); } break; case BOOL: boolean boo = rs.getBoolean(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (boo ? 1.0 : 0.0); } break; case DOUBLE: // BigDecimal bd = rs.getBigDecimal(i, 4); double dd = rs.getDouble(i); // Use the column precision instead of 4? if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // newInst.setValue(i - 1, bd.doubleValue()); vals[i - 1] = dd; } break; case BYTE: byte by = rs.getByte(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) by; } break; case SHORT: short sh = rs.getShort(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) sh; } break; case INTEGER: int in = rs.getInt(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) in; } break; case LONG: long lo = rs.getLong(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) lo; } break; case FLOAT: float fl = rs.getFloat(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { vals[i - 1] = (double) fl; } break; case DATE: Date date = rs.getDate(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // TODO: Do a value check here. 
vals[i - 1] = (double) date.getTime(); } break; case TIME: Time time = rs.getTime(i); if (rs.wasNull()) { vals[i - 1] = Instance.missingValue(); } else { // TODO: Do a value check here. vals[i - 1] = (double) time.getTime(); } break; default: vals[i - 1] = Instance.missingValue(); } } Instance newInst; if (m_CreateSparseData) { newInst = new SparseInstance(1.0, vals); } else { newInst = new Instance(1.0, vals); } instances.addElement(newInst); rowCount++; } //disconnectFromDatabase(); (perhaps other queries might be made) // Create the header and add the instances to the dataset if (m_Debug) System.err.println("Creating header..."); FastVector attribInfo = new FastVector(); for (int i = 0; i < numAttributes; i++) { /* Fix for databases that uppercase column names */ // String attribName = attributeCaseFix(md.getColumnName(i + 1)); String attribName = attributeCaseFix(columnNames.get(i)); switch (attributeTypes[i]) { case Attribute.NOMINAL: attribInfo.addElement(new Attribute(attribName, nominalStrings[i])); break; case Attribute.NUMERIC: attribInfo.addElement(new Attribute(attribName)); break; case Attribute.STRING: Attribute att = new Attribute(attribName, (FastVector) null); attribInfo.addElement(att); for (int n = 0; n < nominalStrings[i].size(); n++) { att.addStringValue((String) nominalStrings[i].elementAt(n)); } break; case Attribute.DATE: attribInfo.addElement(new Attribute(attribName, (String) null)); break; default: throw new Exception("Unknown attribute type"); } } Instances result = new Instances("QueryResult", attribInfo, instances.size()); for (int i = 0; i < instances.size(); i++) { result.add((Instance) instances.elementAt(i)); } close(rs); return result; }
From source file:qa.experiment.ProcessFeatureVector.java
public String trainAndPredict(String[] processNames, String question) throws Exception { FastVector fvWekaAttribute = generateWEKAFeatureVector(processNames); Instances trainingSet = new Instances("Rel", fvWekaAttribute, bowFeature.size() + 1); trainingSet.setClassIndex(bowFeature.size()); int cnt = 0;//from w w w.ja v a2 s . c o m for (int i = 0; i < arrProcessFeature.size(); i++) { String[] names = arrProcessFeature.get(i).getProcessName().split("\\|"); int sim = isNameFuzzyMatch(processNames, names); if (sim != -1) { // System.out.println("match " + arrProcessFeature.get(i).getProcessName()); ArrayList<String> featureVector = arrProcessFeature.get(i).getFeatureVectors(); for (int j = 0; j < featureVector.size(); j++) { Instance trainInstance = new Instance(bowFeature.size() + 1); String[] attrValues = featureVector.get(j).split("\t"); // System.out.println(trainInstance.numAttributes()); // System.out.println(fvWekaAttribute.size()); for (int k = 0; k < bowFeature.size(); k++) { trainInstance.setValue((Attribute) fvWekaAttribute.elementAt(k), Integer.parseInt(attrValues[k])); } trainInstance.setValue((Attribute) fvWekaAttribute.elementAt(bowFeature.size()), processNames[sim]); trainingSet.add(trainInstance); //System.out.println(cnt); cnt++; } } } Classifier cl = new NaiveBayes(); cl.buildClassifier(trainingSet); Instance inst = new Instance(bowFeature.size() + 1); //String[] tokenArr = tokens.toArray(new String[tokens.size()]); for (int j = 0; j < bowFeature.size(); j++) { List<String> tokens = slem.tokenize(question); String[] tokArr = tokens.toArray(new String[tokens.size()]); int freq = getFrequency(bowFeature.get(j), tokArr); inst.setValue((Attribute) fvWekaAttribute.elementAt(j), freq); } inst.setDataset(trainingSet); int idxMax = ArrUtil.getIdxMax(cl.distributionForInstance(inst)); return processNames[idxMax]; }
From source file:rdfsystem.data.DataMining.java
private static Instances transformData(RdfManager manager, boolean hasYear) throws Exception { Set<String> words = getAllWords(manager); FastVector binary = new FastVector(); binary.addElement("true"); binary.addElement("false"); FastVector attrs = new FastVector(); if (hasYear) { Attribute yearAttr = new Attribute("year"); attrs.addElement(yearAttr);//from w ww.ja v a 2 s . com } for (String word : words) { Attribute attr = new Attribute(word, binary); attrs.addElement(attr); } Instances ins = new Instances("paper", attrs, 0); for (Map.Entry<String, Paper> item : manager) { Paper p = item.getValue(); double[] row = new double[ins.numAttributes()]; int start = 0; if (hasYear) { row[0] = p.getYear(); start++; } for (int i = start; i < row.length; i++) row[i] = ins.attribute(i).indexOfValue("false"); for (String label : p.getLabel()) { int index = ins.attribute("label_" + label).index(); row[index] = ins.attribute(index).indexOfValue("true"); } for (Author au : p.getList()) { int index = ins.attribute("author_" + au.getId()).index(); row[index] = ins.attribute(index).indexOfValue("true"); } ins.add(new Instance(1.0, row)); } if (hasYear) { NumericToNominal f1 = new NumericToNominal(); f1.setInputFormat(ins); ins = Filter.useFilter(ins, f1); } return ins; }