List of usage examples for weka.core.Instances.attribute()
public Attribute attribute(String name)
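Before the per-project examples below, here is a minimal self-contained sketch of attribute(String name) and its index-based overload, written against the same legacy FastVector-based Weka API used throughout the examples. The class name and attribute names are illustrative only, not taken from the source files that follow.

import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instances;

public class AttributeLookupSketch {
    public static void main(String[] args) {
        // Build a dataset structure with two string attributes and no rows.
        FastVector atts = new FastVector(2);
        atts.addElement(new Attribute("document", (FastVector) null));
        atts.addElement(new Attribute("keyphrases", (FastVector) null));
        Instances data = new Instances("example_data", atts, 0);

        // attribute(String name) looks an attribute up by name and
        // returns null if no attribute with that name exists.
        Attribute doc = data.attribute("document");
        System.out.println(doc.name() + " isString=" + doc.isString());

        // attribute(int index) fetches an attribute positionally.
        System.out.println(data.attribute(0).name());
    }
}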
From source file: com.entopix.maui.filters.MauiFilter.java
License: Open Source License
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input instance
 * structure (any instances contained in the object are ignored - only the
 * structure is required).
 * @return true if the outputFormat may be collected immediately
 */
public boolean setInputFormat(Instances instanceInfo) throws MauiFilterException {
    if (instanceInfo.classIndex() >= 0) {
        throw new MauiFilterException("Don't know what to do if class index is set!");
    }
    if (!instanceInfo.attribute(keyphrasesAtt).isString()
            || !instanceInfo.attribute(documentAtt).isString()) {
        throw new MauiFilterException(
                "Keyphrase attribute and document attribute need to be string attributes.");
    }
    try {
        phraseFilter = new MauiPhraseFilter();
        int[] arr = new int[1];
        arr[0] = documentAtt;
        phraseFilter.setAttributeIndicesArray(arr);
        phraseFilter.setInputFormat(instanceInfo);
    } catch (Exception e) {
        throw new MauiFilterException("Exception loading MauiPhraseFilter");
    }
    try {
        if (vocabularyName.equals("none")) {
            numbersFilter = new NumbersFilter();
            numbersFilter.setInputFormat(phraseFilter.getOutputFormat());
            super.setInputFormat(numbersFilter.getOutputFormat());
        } else {
            super.setInputFormat(phraseFilter.getOutputFormat());
        }
    } catch (Exception e) {
        throw new MauiFilterException("Exception loading NumbersFilter");
    }
    return false;
}
From source file: com.entopix.maui.main.MauiModelBuilder.java
License: Open Source License
/**
 * Builds the model from the training data.
 *
 * @throws MauiFilterException
 */
public MauiFilter buildModel(List<MauiDocument> documents) throws MauiFilterException {
    log.info("-- Building the model... ");
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("filename", (FastVector) null));
    atts.addElement(new Attribute("document", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    mauiFilter = new MauiFilter();
    mauiFilter.setMaxPhraseLength(maxPhraseLength);
    mauiFilter.setMinPhraseLength(minPhraseLength);
    mauiFilter.setMinNumOccur(minNumOccur);
    mauiFilter.setStemmer(stemmer);
    mauiFilter.setDocumentLanguage(documentLanguage);
    mauiFilter.setVocabularyName(vocabularyName);
    mauiFilter.setVocabularyFormat(vocabularyFormat);
    mauiFilter.setStopwords(stopwords);
    mauiFilter.setVocabulary(vocabulary);
    if (classifier != null) {
        mauiFilter.setClassifier(classifier);
    }
    mauiFilter.setInputFormat(data);

    // Set the feature configuration
    mauiFilter.setBasicFeatures(useBasicFeatures);
    mauiFilter.setKeyphrasenessFeature(useKeyphrasenessFeature);
    mauiFilter.setFrequencyFeatures(useFrequencyFeatures);
    mauiFilter.setPositionsFeatures(usePositionsFeatures);
    mauiFilter.setLengthFeature(useLengthFeature);
    mauiFilter.setThesaurusFeatures(useThesaurusFeatures);
    mauiFilter.setWikipediaFeatures(useWikipediaFeatures, wikiFeatures);
    mauiFilter.setClassifier(classifier);

    if (!vocabularyName.equals("none")) {
        loadVocabulary();
        mauiFilter.setVocabulary(vocabulary);
    }

    log.info("-- Adding documents as instances... ");
    for (MauiDocument document : documents) {
        double[] newInst = new double[3];
        newInst[0] = data.attribute(0).addStringValue(document.getFileName());

        // Add the text and the topics of the document to the instance
        if (document.getTextContent().length() > 0) {
            newInst[1] = data.attribute(1).addStringValue(document.getTextContent());
        } else {
            newInst[1] = Instance.missingValue();
        }
        if (document.getTopicsString().length() > 0) {
            newInst[2] = data.attribute(2).addStringValue(document.getTopicsString());
        } else {
            newInst[2] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));

        mauiFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    log.info("-- Building the model... ");

    mauiFilter.batchFinished();
    while ((mauiFilter.output()) != null) {
    }
    return mauiFilter;
}
From source file: com.github.polarisation.kea.main.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Extracts keyphrases from the files.
 */
public void extractKeyphrases(Hashtable stems) throws Exception {
    Vector stats = new Vector();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    System.err.println("-- Extracting Keyphrases... ");
    // Extract keyphrases: enumerate over all files in the directory (now in the hash).
    Enumeration elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = (String) elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            // keyStr collects the keyphrases in the str.key file. KEA assumes
            // that these keyphrases were assigned by the author and evaluates
            // extracted keyphrases against them.
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                System.err.println("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            System.err.println("-- Document: " + str);
        }

        // Iterate over all extracted keyphrases (inst).
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            System.err.println("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                    numExtracted += 1.0;
                }
                if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                    numCorrect += 1.0;
                }
                if (printer != null) {
                    printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                    if (m_AdditionalInfo) {
                        printer.print("\t");
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                        printer.print("\t");
                        printer.print(Utils.doubleToString(
                                topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                    }
                    printer.println();
                }
                if (m_debug) {
                    System.err.println(topRankedInstances[i]);
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                System.err.println("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    System.err.println("Avg. number of matching keyphrases compared to existing ones: "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    System.err.println("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.filter.KEAFilter.java
License: Open Source License
/**
 * Sets the format of the input instances.
 *
 * @param instanceInfo an Instances object containing the input
 * instance structure (any instances contained in the object are
 * ignored - only the structure is required).
 * @return true if the outputFormat may be collected immediately
 */
public boolean setInputFormat(Instances instanceInfo) throws Exception {
    if (instanceInfo.classIndex() >= 0) {
        throw new Exception("Don't know what to do if class index is set!");
    }
    if (!instanceInfo.attribute(m_KeyphrasesAtt).isString()
            || !instanceInfo.attribute(m_DocumentAtt).isString()) {
        throw new Exception("Keyphrase attribute and document attribute need to be string attributes.");
    }
    m_PunctFilter = new KEAPhraseFilter();
    int[] arr = new int[1];
    arr[0] = m_DocumentAtt;
    m_PunctFilter.setAttributeIndicesArray(arr);
    m_PunctFilter.setInputFormat(instanceInfo);
    m_PunctFilter.setDisallowInternalPeriods(getDisallowInternalPeriods());
    if (m_vocabulary.equals("none")) {
        m_NumbersFilter = new NumbersFilter();
        m_NumbersFilter.setInputFormat(m_PunctFilter.getOutputFormat());
        super.setInputFormat(m_NumbersFilter.getOutputFormat());
    } else {
        super.setInputFormat(m_PunctFilter.getOutputFormat());
    }
    return false;
}
From source file: com.openkm.kea.metadata.SubjectExtractor.java
License: Open Source License
/**
 * extractSuggestedSubjects
 *
 * @param documentText
 * @return
 */
public List<String> extractSuggestedSubjects(String documentText) {
    Date start, stop;
    start = new Date();
    List<String> subjects = new ArrayList<String>();

    // Build the bare dataset structure expected by the filter.
    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances unknownDataStructure = new Instances("keyphrase_training_data", atts, 0);

    try {
        // The extraction step (not fully understood yet):
        // "unknownDataStructure" is called "instances" in the original KEA code.
        double[] unknownStructure = new double[2];
        unknownStructure[0] = (double) unknownDataStructure.attribute(0).addStringValue(documentText);
        // This slot is used for existing subjects - we have none.
        unknownStructure[1] = Instance.missingValue();
        unknownDataStructure.add(new Instance(1.0, unknownStructure));
        filter.input(unknownDataStructure.instance(0));
        // Note: the return value of stringFreeStructure() is discarded here.
        unknownDataStructure.stringFreeStructure();

        // Getting the results out - better understood.
        Instance[] rankedSubjects = new Instance[this.subjectNumLimit];
        Instance subject;
        while ((subject = filter.output()) != null) {
            int index = (int) subject.value(filter.getRankIndex()) - 1;
            if (index < subjectNumLimit) {
                rankedSubjects[index] = subject;
            }
        }
        for (int i = 0; i < subjectNumLimit; i++) {
            if (rankedSubjects[i] != null) {
                subjects.add(rankedSubjects[i].stringValue(filter.getUnstemmedPhraseIndex()));
            }
        }
    } catch (Exception e) {
        log.error("problem in subject extraction: ", e);
    } finally {
        stop = new Date();
        long time = (stop.getTime() - start.getTime());
        log.info("Subject extraction completed in " + time + "ms");
    }
    return subjects;
}
From source file: com.openkm.kea.modelcreator.KEAKeyphraseExtractor.java
License: Open Source License
/**
 * Extracts keyphrases from the files.
 */
public void extractKeyphrases(Hashtable<String, Double> stems) throws Exception {
    Vector<Double> stats = new Vector<Double>();

    // Check whether there is actually any data,
    // i.e. whether there are any files in the directory.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    m_KEAFilter.setNumPhrases(m_numPhrases);
    m_KEAFilter.setVocabulary(m_vocabulary);
    m_KEAFilter.setVocabularyFormat(m_vocabularyFormat);
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setStemmer(m_Stemmer);
    m_KEAFilter.setStopwords(m_Stopwords);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(m_Stemmer, m_Stopwords);
    }

    FastVector atts = new FastVector(3);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    atts.addElement(new Attribute("filename", (String) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    if (m_KEAFilter.m_Dictionary == null) {
        buildGlobalDictionaries(stems);
    }

    log.info("-- Extracting Keyphrases... ");
    // Extract keyphrases: enumerate over all files in the directory (now in the hash).
    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("Can't read document " + str + ".txt");
            }
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            // keyStr collects the keyphrases in the str.key file. KEA assumes
            // that these keyphrases were assigned by the author and evaluates
            // extracted keyphrases against them.
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            if (m_debug) {
                log.debug("No existing keyphrases for stem " + str + ".");
            }
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
        if (m_debug) {
            log.debug("-- Document: " + str);
        }

        // Iterate over all extracted keyphrases (inst).
        Instance[] topRankedInstances = new Instance[m_numPhrases];
        Instance inst;
        while ((inst = m_KEAFilter.output()) != null) {
            int index = (int) inst.value(m_KEAFilter.getRankIndex()) - 1;
            if (index < m_numPhrases) {
                topRankedInstances[index] = inst;
            }
        }
        if (m_debug) {
            log.debug("-- Keyphrases and feature values:");
        }
        FileOutputStream out = null;
        PrintWriter printer = null;
        File key = new File(m_dirName + "/" + str + ".key");
        if (!key.exists()) {
            out = new FileOutputStream(m_dirName + "/" + str + ".key");
            if (!m_encoding.equals("default")) {
                printer = new PrintWriter(new OutputStreamWriter(out, m_encoding));
            } else {
                printer = new PrintWriter(out);
            }
        }
        double numExtracted = 0, numCorrect = 0;
        for (int i = 0; i < m_numPhrases; i++) {
            if (topRankedInstances[i] != null) {
                // My addition: exclude low-ranking phrases.
                double rank = topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex());
                if (rank >= 0.00) {
                    if (!topRankedInstances[i].isMissing(topRankedInstances[i].numAttributes() - 1)) {
                        numExtracted += 1.0;
                    }
                    if ((int) topRankedInstances[i].value(topRankedInstances[i].numAttributes() - 1) == 1) {
                        numCorrect += 1.0;
                    }
                    if (printer != null) {
                        printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getUnstemmedPhraseIndex()));
                        if (m_AdditionalInfo) {
                            printer.print("\t");
                            printer.print(topRankedInstances[i].stringValue(m_KEAFilter.getStemmedPhraseIndex()));
                            printer.print("\t");
                            printer.print(Utils.doubleToString(
                                    topRankedInstances[i].value(m_KEAFilter.getProbabilityIndex()), 4));
                        }
                        printer.println();
                    }
                    if (m_debug) {
                        log.debug("" + topRankedInstances[i]);
                    }
                }
            }
        }
        if (numExtracted > 0) {
            if (m_debug) {
                log.debug("-- " + numCorrect + " correct");
            }
            stats.addElement(new Double(numCorrect));
        }
        if (printer != null) {
            printer.flush();
            printer.close();
            out.close();
        }
    }
    double[] st = new double[stats.size()];
    for (int i = 0; i < stats.size(); i++) {
        st[i] = ((Double) stats.elementAt(i)).doubleValue();
    }
    double avg = Utils.mean(st);
    double stdDev = Math.sqrt(Utils.variance(st));
    log.info("Avg. number of matching keyphrases compared to existing ones: "
            + Utils.doubleToString(avg, 2) + " +/- " + Utils.doubleToString(stdDev, 2));
    log.info("Based on " + stats.size() + " documents");
    // m_KEAFilter.batchFinished();
}
From source file: com.openkm.kea.modelcreator.KEAModelBuilder.java
License: Open Source License
/**
 * Builds the model from the files.
 */
public void buildModel(Hashtable<String, Double> stems, Stopwords stopwords) throws Exception {
    // Check whether there is actually any data.
    if (stems.size() == 0) {
        throw new Exception("Couldn't find any data!");
    }
    FastVector atts = new FastVector(2);
    atts.addElement(new Attribute("doc", (FastVector) null));
    atts.addElement(new Attribute("keyphrases", (FastVector) null));
    Instances data = new Instances("keyphrase_training_data", atts, 0);

    // Build the model.
    m_KEAFilter = new KEAFilter(stopwords);
    m_KEAFilter.setDebug(m_debug);
    m_KEAFilter.setDisallowInternalPeriods(getDisallowIPeriods());
    m_KEAFilter.setKFused(getUseKFrequency());
    m_KEAFilter.setMaxPhraseLength(getMaxPhraseLength());
    m_KEAFilter.setMinPhraseLength(getMinPhraseLength());
    m_KEAFilter.setMinNumOccur(getMinNumOccur());
    m_KEAFilter.setStemmer(getStemmer());
    m_KEAFilter.setDocumentLanguage(getDocumentLanguage());
    m_KEAFilter.setVocabulary(getVocabulary());
    m_KEAFilter.setVocabularyFormat(getVocabularyFormat());
    m_KEAFilter.setStopwords(getStopwords());
    m_KEAFilter.setCheckForProperNouns(getCheckForProperNouns());
    m_KEAFilter.setInputFormat(data);
    if (getVocabulary().equals("none")) {
        m_KEAFilter.m_NODEfeature = false;
    } else {
        m_KEAFilter.loadThesaurus(getStemmer(), getStopwords());
    }
    m_KEAFilter.setNumFeature();

    log.info("-- Reading the Documents... ");
    Enumeration<String> elem = stems.keys();
    while (elem.hasMoreElements()) {
        String str = elem.nextElement();
        double[] newInst = new double[2];
        try {
            File txt = new File(m_dirName + "/" + str + ".txt");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(txt), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(txt));
            }
            StringBuffer txtStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                txtStr.append((char) c);
            }
            is.close();
            newInst[0] = (double) data.attribute(0).addStringValue(txtStr.toString());
        } catch (Exception e) {
            log.error("Can't find document for stem " + str + ".");
            newInst[0] = Instance.missingValue();
        }
        try {
            File key = new File(m_dirName + "/" + str + ".key");
            InputStreamReader is;
            if (!m_encoding.equals("default")) {
                is = new InputStreamReader(new FileInputStream(key), m_encoding);
            } else {
                is = new InputStreamReader(new FileInputStream(key));
            }
            StringBuffer keyStr = new StringBuffer();
            int c;
            while ((c = is.read()) != -1) {
                keyStr.append((char) c);
            }
            newInst[1] = (double) data.attribute(1).addStringValue(keyStr.toString());
        } catch (Exception e) {
            log.error("Can't find keyphrases for stem " + str + ".");
            newInst[1] = Instance.missingValue();
        }
        data.add(new Instance(1.0, newInst));
        m_KEAFilter.input(data.instance(0));
        data = data.stringFreeStructure();
    }
    m_KEAFilter.batchFinished();
    while ((m_KEAFilter.output()) != null) {
    }
}
From source file: com.rapidminer.tools.WekaTools.java
License: Open Source License
/**
 * Creates a RapidMiner example set from Weka instances. Only a label can be
 * used as a special attribute; other types of special attributes are not
 * supported. If <code>attributeNamePrefix</code> is not null, the given
 * string prefix plus a number is used as the attribute name.
 */
public static ExampleSet toRapidMinerExampleSet(Instances instances, String attributeNamePrefix,
        int datamanagement) {
    int classIndex = instances.classIndex();

    // Create the example table
    // 1. Extract attributes
    List<Attribute> attributes = new ArrayList<Attribute>();
    int number = 1; // used for attribute names
    for (int i = 0; i < instances.numAttributes(); i++) {
        weka.core.Attribute wekaAttribute = instances.attribute(i);
        int rapidMinerAttributeValueType = Ontology.REAL;
        if (wekaAttribute.isNominal())
            rapidMinerAttributeValueType = Ontology.NOMINAL;
        else if (wekaAttribute.isString())
            rapidMinerAttributeValueType = Ontology.STRING;
        Attribute attribute = AttributeFactory.createAttribute(wekaAttribute.name(),
                rapidMinerAttributeValueType);
        if ((i != classIndex) && (attributeNamePrefix != null) && (attributeNamePrefix.length() > 0)) {
            attribute.setName(attributeNamePrefix + "_" + (number++));
        }
        if (wekaAttribute.isNominal()) {
            for (int a = 0; a < wekaAttribute.numValues(); a++) {
                String nominalValue = wekaAttribute.value(a);
                attribute.getMapping().mapString(nominalValue);
            }
        }
        attributes.add(attribute);
    }
    Attribute label = null;
    if (classIndex >= 0) {
        label = attributes.get(classIndex);
        label.setName("label");
    }

    // 2. Guarantee an alphabetical mapping to numbers
    for (int j = 0; j < attributes.size(); j++) {
        Attribute attribute = attributes.get(j);
        if (attribute.isNominal())
            attribute.getMapping().sortMappings();
    }

    // 3. Read the data
    MemoryExampleTable table = new MemoryExampleTable(attributes);
    DataRowFactory factory = new DataRowFactory(datamanagement, '.');
    List<DataRow> dataList = new LinkedList<DataRow>();
    int numberOfRapidMinerAttributes = instances.numAttributes();
    for (int i = 0; i < instances.numInstances(); i++) {
        Instance instance = instances.instance(i);
        DataRow dataRow = factory.create(numberOfRapidMinerAttributes);
        for (int a = 0; a < instances.numAttributes(); a++) {
            Attribute attribute = table.getAttribute(a);
            double wekaValue = instance.value(a);
            if (attribute.isNominal()) {
                String nominalValue = instances.attribute(a).value((int) wekaValue);
                dataRow.set(attribute, attribute.getMapping().mapString(nominalValue));
            } else {
                dataRow.set(attribute, wekaValue);
            }
        }
        dataRow.trim();
        dataList.add(dataRow);
    }
    table.readExamples(new ListDataRowReader(dataList.iterator()));

    // Create and return the example set (the label is handled separately).
    return table.createExampleSet(label);
}
From source file: com.relationalcloud.main.ExplanationSingleAttribute.java
License: Open Source License
/**
 * @param args
 */
@Deprecated
public static void main(String[] args) {
    Properties ini = new Properties();
    try {
        ini.load(new FileInputStream(System.getProperty("prop")));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Load properties from file
    String schemaname = ini.getProperty("schemaname");
    String partitioningMethod = ini.getProperty("partitioningMethod");
    String pcol;
    if (partitioningMethod.equals("repGraph")) {
        System.out.println("Replication Graph: using replicated column");
        pcol = ini.getProperty("replicatedPartitionCol");
    } else {
        pcol = ini.getProperty("graphPartitionCol");
    }
    String accessLogTable = ini.getProperty("accessLogTable");
    String numb_trans_to_process = ini.getProperty("numb_trans_to_process");
    String txnLogTable = ini.getProperty("txnLogTable");
    String driver = ini.getProperty("driver");
    String connection = ini.getProperty("conn");
    String user = ini.getProperty("user");
    String password = ini.getProperty("password");

    System.out.println("Loading and processing " + schemaname + " traces...");

    // Register the JDBC driver
    try {
        Class.forName(driver);
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }

    Connection conn;
    try {
        conn = DriverManager.getConnection(connection + schemaname, user, password);
        conn.setAutoCommit(true);
        Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user,
                password);
        Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname);
        Statement stmt = conn.createStatement();

        // NOTE: the parameter numb_trans_to_process is used to limit the number
        // of transactions parsed to determine which attributes are common in
        // the workload WHERE clauses. This can be a subset of the overall set.
        String sqlstring = "SELECT sqlstring FROM `" + txnLogTable + "` LIMIT " + numb_trans_to_process;
        ResultSet res = stmt.executeQuery(sqlstring);

        ExplanationWorkloadPrepocessor wa = new ExplanationWorkloadPrepocessor(schemaname, schema);

        double tstart = System.currentTimeMillis();
        double i = 0;
        while (res.next()) {
            String sql = res.getString(1);
            // Parse the statement
            wa.processSql(sql);
            i++;
        }
        double tend = System.currentTimeMillis();

        System.out.println("Processed " + i + " statements in " + (tend - tstart) + "ms average:"
                + (tend - tstart) / i + "ms per statement");
        System.out.println("ANALYSIS RESULTS:\n ");
        wa.printStatsByTableColumn();

        for (String str : wa.getAllTableNames()) {
            if (str == null)
                continue;
            System.out.println("-------------------------------------------");
            System.out.println("ANALYZING TABLE USED IN THE TRANSACTION TRACE " + str);
            for (SimpleCount sc : wa.getFeatures(str)) {
                ArrayList<Double> a0 = new ArrayList<Double>();
                ArrayList<Double> a1 = new ArrayList<Double>();
                sqlstring = "SELECT s." + sc.colname + ", g." + pcol + " FROM `" + accessLogTable
                        + "` g, relcloud_" + str + " s WHERE tableid = \"" + str
                        + "\" AND s.relcloud_id = g.tupleid";
                // System.out.println(sqlstring);
                res = stmt.executeQuery(sqlstring);
                while (res.next()) {
                    Object o1 = res.getObject(1);
                    Object o2 = res.getObject(2);
                    if (o1 != null && o2 != null) {
                        a0.add(new Double(o1.hashCode()));
                        a1.add(new Double(o2.hashCode()));
                    }
                }
                if (a0.size() >= 1) {
                    double[] d0 = new double[a0.size()];
                    double[] d1 = new double[a1.size()];
                    boolean unary = true;
                    for (int j = 0; j < a0.size(); j++) {
                        d0[j] = a0.get(j).doubleValue();
                        d1[j] = a1.get(j).doubleValue();
                        if (j > 0 && d1[j - 1] != d1[j])
                            unary = false;
                    }
                    if (unary) {
                        System.out.println("EASY CASE: " + str
                                + " is not partitioned and is stored in partition: " + d1[0]);
                    } else {
                        double correlation = PearsonCorrelation.getPearsonCorrelation(d0, d1);
                        correlationThreshold = Double.parseDouble(ini.getProperty("correlationThreshold"));
                        // If the correlation is high enough, proceed to use decision trees.
                        if (Math.abs(correlation) > correlationThreshold) {
                            System.out.println("Testing " + str + "." + sc.colname + ", " + pcol
                                    + " correlation: " + correlation + " (HIGH)");
                            try {
                                // InstanceQuery query = new InstanceQuery();
                                // query.setUsername("bbb");
                                // query.setPassword("qwer");
                                // query.connectToDatabase();
                                // Instances data = query.retrieveInstances(sqlstring);
                                res.beforeFirst();
                                Instances data = WekaHelper.retrieveInstanceFromResultSet(res);
                                // Set the last column to be the class index (is this correct?)
                                data.setClassIndex(data.numAttributes() - 1);
                                Instances newData;
                                if (data.attribute(data.numAttributes() - 1).type() == Attribute.NUMERIC) {
                                    NumericToNominal ntn = new NumericToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                } else {
                                    StringToNominal ntn = new StringToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                }
                                String[] options = new String[1];
                                options[0] = "-P";
                                J48 tree = new J48(); // new instance of tree
                                tree.setOptions(options); // set the options
                                if (!tree.getCapabilities().test(newData)) {
                                    System.err.println("ERROR: THE FOLLOWING DATA CANNOT BE PROCESSED:"
                                            + newData.toSummaryString());
                                    System.err.println("QUERY WAS:" + sqlstring);
                                } else {
                                    long treeTstart = System.currentTimeMillis();
                                    tree.buildClassifier(newData); // build classifier
                                    long treeTend = System.currentTimeMillis();
                                    System.out.println("CLASSIFICATION CONFIDENCE: "
                                            + tree.getConfidenceFactor() + "\n TREE BUILDING TIME: "
                                            + (treeTend - treeTstart) + "ms \n" + tree.toString());
                                    System.out.println("TREE:" + tree.prefix());
                                }
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        } else {
                            System.out.println("Testing " + str + "." + sc.colname + ", " + pcol
                                    + " correlation: " + correlation + " (LOW)");
                        }
                    }
                }
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
}
From source file: com.relationalcloud.misc.JustifyAgnosticPartitioning.java
License: Open Source License
/**
 * @param args
 */
public static void main(String[] args) {
    Properties ini = new Properties();
    try {
        ini.load(new FileInputStream(System.getProperty("prop")));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }

    // Register the JDBC driver
    try {
        Class.forName(ini.getProperty("driver"));
    } catch (ClassNotFoundException e) {
        e.printStackTrace();
    }

    // Read the TPC-C transaction log from MySQL, parse each statement,
    // and test various parser functionalities.
    System.out.println("Loading and processing TPCC traces...");

    Connection conn;
    try {
        String schemaname = ini.getProperty("schema");
        String connection = ini.getProperty("conn");
        String user = ini.getProperty("user");
        String password = ini.getProperty("password");
        conn = DriverManager.getConnection(connection + schemaname, user, password);
        Connection infschema_conn = DriverManager.getConnection(connection + "information_schema", user,
                password);
        Schema schema = SchemaLoader.loadSchemaFromDB(infschema_conn, schemaname);
        ExplanationWorkloadPrepocessor wa = new ExplanationWorkloadPrepocessor(schemaname, schema);
        conn.setAutoCommit(true);
        Statement stmt = conn.createStatement();

        String txnLogTable = ini.getProperty("txnLogTable");
        String sqlstring = "SELECT sqlstring FROM `" + txnLogTable + "`";
        ResultSet res = stmt.executeQuery(sqlstring);
        double tstart = System.currentTimeMillis();
        double i = 0;
        while (res.next()) {
            String sql = res.getString(1);
            // Parse the statement
            wa.processSql(sql);
            // System.out.println("SQL: " + sql);
            i++;
        }
        double tend = System.currentTimeMillis();

        String accessLogTable = ini.getProperty("accessLogTable");
        System.out.println("Processed " + i + " statements in " + (tend - tstart) + "ms average:"
                + (tend - tstart) / i + "ms per statement");

        for (String str : wa.getAllTableNames()) {
            System.out.println("-------------------------------------------");
            System.out.println("ANALYZING TABLE " + str);
            for (SimpleCount sc : wa.getFeatures(str)) {
                ArrayList<Double> a0 = new ArrayList<Double>();
                ArrayList<Double> a1 = new ArrayList<Double>();
                sqlstring = "SELECT s." + sc.colname + ", g.partition FROM `" + accessLogTable + "` g, "
                        + str + " s WHERE tableid = \"" + str + "\" AND s.id = g.id";
                System.out.println(sqlstring);
                res = stmt.executeQuery(sqlstring);
                while (res.next()) {
                    a0.add(new Double(res.getObject(1).hashCode()));
                    a1.add(new Double(res.getObject(2).hashCode()));
                }
                if (a0.size() >= 1) {
                    double[] d0 = new double[a0.size()];
                    double[] d1 = new double[a1.size()];
                    boolean unary = true;
                    for (int j = 0; j < a0.size(); j++) {
                        d0[j] = a0.get(j).doubleValue();
                        d1[j] = a1.get(j).doubleValue();
                        if (j > 0 && d1[j - 1] != d1[j])
                            unary = false;
                    }
                    if (unary) {
                        System.out.println("EASY CASE: " + str
                                + " is not partitioned and is stored in partition: " + d1[0]);
                    } else {
                        double correlation = PearsonCorrelation.getPearsonCorrelation(d0, d1);
                        correlationThreshold = Double.parseDouble(ini.getProperty("correlationThreshold"));
                        // If the correlation is high enough, proceed to use decision trees.
                        if (Math.abs(correlation) > correlationThreshold) {
                            System.out.println("Testing " + str + "." + sc.colname
                                    + ", g.partition correlation: " + correlation + " (HIGH)");
                            try {
                                // InstanceQuery query = new InstanceQuery();
                                // query.setUsername("bbb");
                                // query.setPassword("qwer");
                                // query.connectToDatabase();
                                // Instances data = query.retrieveInstances(sqlstring);
                                res.beforeFirst();
                                Instances data = retrieveInstanceFromResultSet(res);
                                // Set the last column to be the class index (is this correct?)
                                data.setClassIndex(data.numAttributes() - 1);
                                Instances newData;
                                if (data.attribute(data.numAttributes() - 1).type() == Attribute.NUMERIC) {
                                    NumericToNominal ntn = new NumericToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                } else {
                                    StringToNominal ntn = new StringToNominal();
                                    String[] options = new String[2];
                                    options[0] = "-R"; // "range"
                                    options[1] = "2"; // first attribute
                                    ntn.setOptions(options); // set options
                                    ntn.setInputFormat(data); // inform filter about dataset
                                                              // **AFTER** setting options
                                    newData = Filter.useFilter(data, ntn); // apply filter
                                }
                                String[] options = new String[1];
                                options[0] = "-P";
                                J48 tree = new J48(); // new instance of tree
                                tree.setOptions(options); // set the options
                                if (!tree.getCapabilities().test(newData)) {
                                    System.err.println("ERROR: THE FOLLOWING DATA CANNOT BE PROCESSED:"
                                            + newData.toSummaryString());
                                    System.err.println("QUERY WAS:" + sqlstring);
                                } else {
                                    tree.buildClassifier(newData); // build classifier
                                }
                                System.out.println("CLASSIFICATION CONFIDENCE: " + tree.getConfidenceFactor()
                                        + "\n " + tree.toString());
                            } catch (Exception e) {
                                e.printStackTrace();
                            }
                        } else {
                            System.out.println("Testing " + str + "." + sc.colname
                                    + ", g.partition correlation: " + correlation + " (LOW)");
                        }
                    }
                }
            }
        }
    } catch (SQLException e) {
        e.printStackTrace();
    }
}