List of usage examples for weka.core.Instances.attribute()
public Attribute attribute(String name)
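Before the per-project examples below, here is a minimal self-contained sketch of attribute(String name) and its index-based overload, written against the same legacy FastVector-based Weka API used throughout the examples. The class name and attribute names are illustrative only, not taken from the source files that follow.

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instances;

public class AttributeLookupSketch {
    public static void main(String[] args) {
        // Build a dataset structure with two string attributes and no rows.
        FastVector atts = new FastVector(2);
        atts.addElement(new Attribute("document", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        Instances data = new Instances("example_data", atts, 0);

        // attribute(String name) looks an attribute up by name and
        // returns null if no attribute with that name exists.
        Attribute doc = data.attribute("document");
        System.out.println(doc.name() + " isString=" + doc.isString());

        // attribute(int index) fetches an attribute positionally.
        System.out.println(data.attribute(0).name());
    }
}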
From source file: com.entopix.maui.filters.MauiFilter.java
License: Open Source License
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance
 * structure (any instances contained in the object are ignored - only the
 * structure is required).
 * @return true if the outputFormat may be collected immediately
 */
public boolean setInputFormat(Instances instanceInfo) throws MauiFilterException {
    if (instanceInfo.classIndex() >= 0) {
        throw new MauiFilterException("Don't know what to do if class index is set!");
    }
    if (!instanceInfo.attribute(keyphrasesAtt).isString()
            || !instanceInfo.attribute(documentAtt).isString()) {
        throw new MauiFilterException(
                "Keyphrase attribute and document attribute need to be string attributes.");
    }
    try {
        phraseFilter = new MauiPhraseFilter();
        int[] arr = new int[1];
        arr[0] = documentAtt;
        phraseFilter.setAttributeIndicesArray(arr);
        phraseFilter.setInputFormat(instanceInfo);
    } catch (Exception e) {
        throw new MauiFilterException("Exception loading MauiPhraseFilter");
    }
    try {
        if (vocabularyName.equals("none")) {
            numbersFilter = new NumbersFilter();
            numbersFilter.setInputFormat(phraseFilter.getOutputFormat());
            super.setInputFormat(numbersFilter.getOutputFormat());
        } else {
            super.setInputFormat(phraseFilter.getOutputFormat());
        }
    } catch (Exception e) {
        throw new MauiFilterException("Exception loading NumbersFilter");
    }
    return false;
}
From source file: com.entopix.maui.main.MauiModelBuilder.java
License: Open Source License
/**
 * Builds the model from the training data.
 *
 * @throws MauiFilterException
 */
public MauiFilter buildModel(List<MauiDocument> documents) throws MauiFilterException {
    log.info("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    mauiFilter = new MauiFilter();
    mauiFilter.setMaxPhraseLength(maxPhraseLength);
    mauiFilter.setMinPhraseLength(minPhraseLength);
    mauiFilter.setMinNumOccur(minNumOccur);
    mauiFilter.setStemmer(stemmer);
    mauiFilter.setDocumentLanguage(documentLanguage);
    mauiFilter.setVocabularyName(vocabularyName);
    mauiFilter.setVocabularyFormat(vocabularyFormat);
    mauiFilter.setStopwords(stopwords);
    mauiFilter.setVocabulary(vocabulary);
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // Set the feature configuration
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useThesaurusFeatures);
    mauiFilter.setWikipediaFeatures(useWikipediaFeatures, wikiFeatures);
    mauiFilter.setClassifier(classifier);

    if (!vocabularyName.equals("none")) {
        loadVocabulary();
        mauiFilter.setVocabulary(vocabulary);
    }

    log.info("-- Adding documents as instances... ");
    for (MauiDocument document : documents) {
        double[] newInst = new double[3];
        newInst[0] = data.attribute(0).addStringValue(document.getFileName());

        // Add the text and the topics of the document to the instance
        if (document.getTextContent().length() > 0) {
            newInst[1] = data.attribute(1).addStringValue(document.getTextContent());
        } else {
            newInst[1] = Instance.missingValue();
        }
        if (document.getTopicsString().length() > 0) {
            newInst[2] = data.attribute(2).addStringValue(document.getTopicsString());
        } else {
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    log.info("-- Building the model... ");

    mauiFilter.batchFinished();
    while ((mauiFilter.output()) != null) {
    }
    return mauiFilter;
}
From source file: com.github.polarisation.kea.main.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Extracts keyphrases from the files.
 */
public void extractKeyphrases(Hashtable stems) throws Exception {
    Vector stats = new Vector();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases: enumerate over all files in the directory (now in the hash).
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            // keyStr collects the keyphrases in the str.key file. KEA assumes
            // that these keyphrases were assigned by the author and evaluates
            // extracted keyphrases against them.
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }

        // Iterate over all extracted keyphrases (inst).
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of matching keyphrases compared to existing ones: "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.filter.KEAFilter.java
License: Open Source License
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input
 * instance structure (any instances contained in the object are
 * ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    if (instanceInfo.classIndex() >= 0) {
        throw new Exception("Don't know what to do if class index is set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString()
            || !instanceInfo.attribute(m_DocumentAtt).isString()) {
        throw new Exception("Keyphrase attribute and document attribute need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    if (m_vocabulary.equals("none")) {
        m_NumbersFilter = new NumbersFilter();
        m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
        super.setInputFormat(m_NumbersFilter.getOutputFormat());
    } else {
        super.setInputFormat(m_PunctFilter.getOutputFormat());
    }
    return false;
}
From source file: com.openkm.kea.metadata.SubjectExtractor.java
License: Open Source License
/**
 * extractSuggestedSubjects
 *
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {
    Date start, stop;
    start = new Date();
    List<String> subjects = new ArrayList<String>();

    // Build the bare dataset structure expected by the filter.
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // The extraction step (not fully understood yet):
        // "unknownDataStructure" is called "instances" in the original KEA code.
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        // This slot is used for existing subjects - we have none.
        unknownStructure[1] = Instance.missingValue();
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        // Note: the return value of stringFreeStructure() is discarded here.
        unknownDataStructure.stringFreeStructure();

        // Getting the results out - better understood.
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }
    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }
    return subjects;
}
From source file: com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Extracts keyphrases from the files.
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases: enumerate over all files in the directory (now in the hash).
    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            // keyStr collects the keyphrases in the str.key file. KEA assumes
            // that these keyphrases were assigned by the author and evaluates
            // extracted keyphrases against them.
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }

        // Iterate over all extracted keyphrases (inst).
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: exclude low-ranking phrases.
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());
                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    log.info("Avg. number of matching keyphrases compared to existing ones: "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.modelcreator.KEAModelBuilder.java
License: Open Source License
/**
 * Builds the model from the files.
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build the model.
    m_KEAFilter = new KEAFilter(stopwords);
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");
    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();
    while ((m_KEAFilter.output()) != null) {
    }
}
From source file: com.rapidminer.tools.WekaTools.java
License: Open Source License
/**
 * Creates a RapidMiner example set from Weka instances. Only a label can be
 * used as a special attribute; other types of special attributes are not
 * supported. If <code>attributeNamePrefix</code> is not null, the given
 * string prefix plus a number is used as the attribute name.
 */
public static ExampleSet toRapidMinerExampleSet(Instances instances, String attributeNamePrefix,
        int datamanagement) {
    int classIndex = instances.classIndex();

    // Create the example table
    // 1. Extract attributes
    List<Attribute> attributes = new ArrayList<Attribute>();
    int number = 1; // used for attribute names
    for (int i = 0; i < instances.numAttributes(); i++) {
        weka.core.Attribute wekaAttribute = instances.attribute(i);
        int rapidMinerAttributeValueType = Ontology.REAL;
        if (wekaAttribute.isNominal())
            rapidMinerAttributeValueType = Ontology.NOMINAL;
        else if (wekaAttribute.isString())
            rapidMinerAttributeValueType = Ontology.STRING;
        Attribute attribute = AttributeFactory.createAttribute(wekaAttribute.name(),
                rapidMinerAttributeValueType);
        if ((i != classIndex) && (attributeNamePrefix != null) && (attributeNamePrefix.length() > 0)) {
            attribute.setName(attributeNamePrefix + "_" + (number++));
        }
        if (wekaAttribute.isNominal()) {
            for (int a = 0; a < wekaAttribute.numValues(); a++) {
                String nominalValue = wekaAttribute.value(a);
                attribute.getMapping().mapString(nominalValue);
            }
        }
        attributes.add(attribute);
    }
    Attribute label = null;
    if (classIndex >= 0) {
        label = attributes.get(classIndex);
        label.setName("label");
    }

    // 2. Guarantee an alphabetical mapping to numbers
    for (int j = 0; j < attributes.size(); j++) {
        Attribute attribute = attributes.get(j);
        if (attribute.isNominal())
            attribute.getMapping().sortMappings();
    }

    // 3. Read the data
    MemoryExampleTable table = new MemoryExampleTable(attributes);
    DataRowFactory factory = new DataRowFactory(datamanagement, '.');
    List<DataRow> dataList = new LinkedList<DataRow>();
    int numberOfRapidMinerAttributes = instances.numAttributes();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        DataRow dataRow = factory.create(numberOfRapidMinerAttributes);
        for (int a = 0; a < instances.numAttributes(); a++) {
            Attribute attribute = table.getAttribute(a);
            double wekaValue = instance.value(a);
            if (attribute.isNominal()) {
                String nominalValue = instances.attribute(a).value((int) wekaValue);
                dataRow.set(attribute, attribute.getMapping().mapString(nominalValue));
            } else {
                dataRow.set(attribute, wekaValue);
            }
        }
        dataRow.trim();
        dataList.add(dataRow);
    }
    table.readExamples(new ListDataRowReader(dataList.iterator()));

    // Create and return the example set (the label is handled separately).
    return table.createExampleSet(label);
}
From source file: com.relationalcloud.main.ExplanationSingleAttribute.java
License: Open Source License
/**
 * @param args
 */
@Deprecated
public static void main(String[] args) {
    Properties ini = new Properties();
    try {
        ini.load(new FileInputStream(System.getProperty("prop")));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Load properties from file
    String schemaname = ini.getProperty("schemaname");
    String partitioningMethod = ini.getProperty("partitioningMethod");
    String pcol;
    if (partitioningMethod.equals("repGraph")) {
        System.out.println("Replication Graph: using replicated column");
        pcol = ini.getProperty("replicatedPartitionCol");
    } else {
        pcol = ini.getProperty("graphPartitionCol");
    }
    String accessLogTable = ini.getProperty("accessLogTable");
    String numb_trans_to_process = ini.getProperty("numb_trans_to_process");
    String txnLogTable = ini.getProperty("txnLogTable");
    String driver = ini.getProperty("driver");
    String connection = ini.getProperty("conn");
    String user = ini.getProperty("user");
    String password = ini.getProperty("password");

    System.out.println("Loading and processing " + schemaname + " traces...");

    // Register the JDBC driver
    try {
        Class.forName(driver);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }

    Connection conn;
    try {
        conn = DriverManager.getConnection(connection + schemaname, user, password);
        conn.setAutoCommit(true);
        Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user,
                password);
        Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname);
        Statement stmt = conn.createStatement();

        // NOTE: the parameter numb_trans_to_process is used to limit the number
        // of transactions parsed to determine which attributes are common in
        // the workload WHERE clauses. This can be a subset of the overall set.
        String sqlstring = "SELECT sqlstring FROM `" + txnLogTable + "` LIMIT " + numb_trans_to_process;
        ResultSet res = stmt.executeQuery(sqlstring);

        ExplanationWorkloadPrepocessor wa = new ExplanationWorkloadPrepocessor(schemaname, schema);

        double tstart = System.currentTimeMillis();
        double i = 0;
        while (res.next()) {
            String sql = res.getString(1);
            // Parse the statement
            wa.processSql(sql);
            i++;
        }
        double tend = System.currentTimeMillis();

        System.out.println("Processed " + i + " statements in " + (tend - tstart) + "ms average:"
                + (tend - tstart) / i + "ms per statement");
        System.out.println("ANALYSIS RESULTS:\n ");
        wa.printStatsByTableColumn();

        for (String str : wa.getAllTableNames()) {
            if (str == null)
                continue;
            System.out.println("-------------------------------------------");
            System.out.println("ANALYZING TABLE USED IN THE TRANSACTION TRACE " + str);
            for (SimpleCount sc : wa.getFeatures(str)) {
                ArrayList<Double> a0 = new ArrayList<Double>();
                ArrayList<Double> a1 = new ArrayList<Double>();
                sqlstring = "SELECT s." + sc.colname + ", g." + pcol + " FROM `" + accessLogTable
                        + "` g, relcloud_" + str + " s WHERE tableid = \"" + str
                        + "\" AND s.relcloud_id = g.tupleid";
                // System.out.println(sqlstring);
                res = stmt.executeQuery(sqlstring);
                while (res.next()) {
                    Object o1 = res.getObject(1);
                    Object o2 = res.getObject(2);
                    if (o1 != null && o2 != null) {
                        a0.add(new Double(o1.hashCode()));
                        a1.add(new Double(o2.hashCode()));
                    }
                }
                if (a0.size() >= 1) {
                    double[] d0 = new double[a0.size()];
                    double[] d1 = new double[a1.size()];
                    boolean unary = true;
                    for (int j = 0; j < a0.size(); j++) {
                        d0[j] = a0.get(j).doubleValue();
                        d1[j] = a1.get(j).doubleValue();
                        if (j > 0 && d1[j - 1] != d1[j])
                            unary = false;
                    }
                    if (unary) {
                        System.out.println("EASY CASE: " + str
                                + " is not partitioned and is stored in partition: " + d1[0]);
                    } else {
                        double correlation = PearsonCorrelation.getPearsonCorrelation(d0, d1);
                        correlationThreshold = Double.parseDouble(ini.getProperty("correlationThreshold"));
                        // If the correlation is high enough, proceed to use decision trees.
                        if (Math.abs(correlation) > correlationThreshold) {
                            System.out.println("Testing " + str + "." + sc.colname + ", " + pcol
                                    + " correlation: " + correlation + " (HIGH)");
                            try {
                                // InstanceQuery query = new InstanceQuery();
                                // query.setUsername("bbb");
                                // query.setPassword("qwer");
                                // query.connectToDatabase();
                                // Instances data = query.retrieveInstances(sqlstring);
                                res.beforeFirst();
                                Instances data = WekaHelper.retrieveInstanceFromResultSet(res);
                                // Set the last column to be the class index (is this correct?)
                                data.setClassIndex(data.numAttributes() - 1);
                                Instances newData;
                                if (data.attribute(data.numAttributes() - 1).type() == Attribute.NUMERIC) {
                                    NumericToNominal ntn = new NumericToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                } else {
                                    StringToNominal ntn = new StringToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                }
                                String[] options = new String[1];
                                options[0] = "-P";
                                J48 tree = new J48(); // new instance of tree
                                tree.setOptions(options); // set the options
                                if (!tree.getCapabilities().test(newData)) {
                                    System.err.println("ERROR: THE FOLLOWING DATA CANNOT BE PROCESSED:"
                                            + newData.toSummaryString());
                                    System.err.println("QUERY WAS:" + sqlstring);
                                } else {
                                    long treeTstart = System.currentTimeMillis();
                                    tree.buildClassifier(newData); // build classifier
                                    long treeTend = System.currentTimeMillis();
                                    System.out.println("CLASSIFICATION CONFIDENCE: "
                                            + tree.getConfidenceFactor() + "\n TREE BUILDING TIME: "
                                            + (treeTend - treeTstart) + "ms \n" + tree.toString());
                                    System.out.println("TREE:" + tree.prefix());
                                }
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        } else {
                            System.out.println("Testing " + str + "." + sc.colname + ", " + pcol
                                    + " correlation: " + correlation + " (LOW)");
                        }
                    }
                }
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
}
From source file: com.relationalcloud.misc.JustifyAgnosticPartitioning.java
License: Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    Properties ini = new Properties();
    try {
        ini.load(new FileInputStream(System.getProperty("prop")));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Register the JDBC driver
    try {
        Class.forName(ini.getProperty("driver"));
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }

    // Read the TPC-C transaction log from MySQL, parse each statement,
    // and test various parser functionalities.
    System.out.println("Loading and processing TPCC traces...");

    Connection conn;
    try {
        String schemaname = ini.getProperty("schema");
        String connection = ini.getProperty("conn");
        String user = ini.getProperty("user");
        String password = ini.getProperty("password");
        conn = DriverManager.getConnection(connection + schemaname, user, password);
        Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user,
                password);
        Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname);
        ExplanationWorkloadPrepocessor wa = new ExplanationWorkloadPrepocessor(schemaname, schema);
        conn.setAutoCommit(true);
        Statement stmt = conn.createStatement();

        String txnLogTable = ini.getProperty("txnLogTable");
        String sqlstring = "SELECT sqlstring FROM `" + txnLogTable + "`";
        ResultSet res = stmt.executeQuery(sqlstring);
        double tstart = System.currentTimeMillis();
        double i = 0;
        while (res.next()) {
            String sql = res.getString(1);
            // Parse the statement
            wa.processSql(sql);
            // System.out.println("SQL: " + sql);
            i++;
        }
        double tend = System.currentTimeMillis();

        String accessLogTable = ini.getProperty("accessLogTable");
        System.out.println("Processed " + i + " statements in " + (tend - tstart) + "ms average:"
                + (tend - tstart) / i + "ms per statement");

        for (String str : wa.getAllTableNames()) {
            System.out.println("-------------------------------------------");
            System.out.println("ANALYZING TABLE " + str);
            for (SimpleCount sc : wa.getFeatures(str)) {
                ArrayList<Double> a0 = new ArrayList<Double>();
                ArrayList<Double> a1 = new ArrayList<Double>();
                sqlstring = "SELECT s." + sc.colname + ", g.partition FROM `" + accessLogTable + "` g, "
                        + str + " s WHERE tableid = \"" + str + "\" AND s.id = g.id";
                System.out.println(sqlstring);
                res = stmt.executeQuery(sqlstring);
                while (res.next()) {
                    a0.add(new Double(res.getObject(1).hashCode()));
                    a1.add(new Double(res.getObject(2).hashCode()));
                }
                if (a0.size() >= 1) {
                    double[] d0 = new double[a0.size()];
                    double[] d1 = new double[a1.size()];
                    boolean unary = true;
                    for (int j = 0; j < a0.size(); j++) {
                        d0[j] = a0.get(j).doubleValue();
                        d1[j] = a1.get(j).doubleValue();
                        if (j > 0 && d1[j - 1] != d1[j])
                            unary = false;
                    }
                    if (unary) {
                        System.out.println("EASY CASE: " + str
                                + " is not partitioned and is stored in partition: " + d1[0]);
                    } else {
                        double correlation = PearsonCorrelation.getPearsonCorrelation(d0, d1);
                        correlationThreshold = Double.parseDouble(ini.getProperty("correlationThreshold"));
                        // If the correlation is high enough, proceed to use decision trees.
                        if (Math.abs(correlation) > correlationThreshold) {
                            System.out.println("Testing " + str + "." + sc.colname
                                    + ", g.partition correlation: " + correlation + " (HIGH)");
                            try {
                                // InstanceQuery query = new InstanceQuery();
                                // query.setUsername("bbb");
                                // query.setPassword("qwer");
                                // query.connectToDatabase();
                                // Instances data = query.retrieveInstances(sqlstring);
                                res.beforeFirst();
                                Instances data = retrieveInstanceFromResultSet(res);
                                // Set the last column to be the class index (is this correct?)
                                data.setClassIndex(data.numAttributes() - 1);
                                Instances newData;
                                if (data.attribute(data.numAttributes() - 1).type() == Attribute.NUMERIC) {
                                    NumericToNominal ntn = new NumericToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                } else {
                                    StringToNominal ntn = new StringToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                }
                                String[] options = new String[1];
                                options[0] = "-P";
                                J48 tree = new J48(); // new instance of tree
                                tree.setOptions(options); // set the options
                                if (!tree.getCapabilities().test(newData)) {
                                    System.err.println("ERROR: THE FOLLOWING DATA CANNOT BE PROCESSED:"
                                            + newData.toSummaryString());
                                    System.err.println("QUERY WAS:" + sqlstring);
                                } else {
                                    tree.buildClassifier(newData); // build classifier
                                }
                                System.out.println("CLASSIFICATION CONFIDENCE: " + tree.getConfidenceFactor()
                                        + "\n " + tree.toString());
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        } else {
                            System.out.println("Testing " + str + "." + sc.colname
                                    + ", g.partition correlation: " + correlation + " (LOW)");
                        }
                    }
                }
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
}