Example usage for weka.core Instance setDataset

List of usage examples for weka.core Instance setDataset

Introduction

On this page you can find example usages of weka.core.Instance.setDataset.

Prototype

public void setDataset(Instances instances);

Source Link

Document

Sets the reference to the dataset.

Usage

From source file:gr.ntua.sentimentanalysis.VectorModelSentimentAnalysis.java

License:Open Source License

/**
 * Classifies the sentiment of a piece of text using the {@code classifier} field.
 *
 * @param document the raw text to classify
 * @return {@code "negative"} when the predicted class is 0, {@code "positive"} when it is 1,
 *         and {@code "unknown"} for any other value, including the case where classification
 *         threw an exception
 */
@WebMethod(operationName = "getTweetSentiment")
public String getTextSentiment(String document) {
    Instance textInstance = vmcl.getInstance(-1, REP_MODEL, document);
    // give the new instance a reference to the dataset before classifying it
    textInstance.setDataset(instances);

    // -1 is kept when classification fails, which maps to "unknown" below
    int predictedClass = -1;
    try {
        predictedClass = (int) classifier.classifyInstance(textInstance);
    } catch (Exception e) {
        e.printStackTrace();
    }

    switch (predictedClass) {
    case 0:
        return "negative";
    case 1:
        return "positive";
    default:
        return "unknown";
    }
}

From source file:GroupProject.DMChartUI.java

/**
* Action for the generate button/*from  w  w  w.  jav a  2 s .  c  om*/
* It reads the user input from the table and the selected options and performs
* a classifiecation of the user input
* the user can choose linear regression, naive bayes classifier, or j48 trees to classify 
*
*/
private void generateButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_generateButtonActionPerformed
    // TODO add your handling code here:                                              
    // TODO add your handling code here:
    //File file = new File("studentTemp.csv");
    CSVtoArff converter = new CSVtoArff();
    Instances students = null;
    Instances students2 = null;
    try {
        converter.convert("studentTemp.csv", "studentTemp.arff");
    } catch (IOException ex) {
        Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    try {
        students = new Instances(new BufferedReader(new FileReader("studentTemp.arff")));
        students2 = new Instances(new BufferedReader(new FileReader("studentTemp.arff")));
    } catch (IOException ex) {
        Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    //get column to predict values for 
    //int target=students.numAttributes()-1; 
    int target = dataSelector.getSelectedIndex() + 1;
    System.out.printf("this is the target: %d\n", target);
    //set target 
    students.setClassIndex(target);
    students2.setClassIndex(target);

    //case on which radio button is selected 
    //Linear Regressions
    if (LRB.isSelected()) {

        LinearRegression model = null;
        if (Lmodel != null) {
            model = Lmodel;
        } else {
            buildLinearModel();
            model = Lmodel;
        }

        System.out.println("im doing linear regression");

        equationDisplayArea.setText(model.toString());

        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(true);

        Remove remove = new Remove();
        int[] toremove = { 0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 };
        remove.setAttributeIndicesArray(toremove);

        try {
            remove.setInputFormat(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        Instances instNew = null;
        try {
            instNew = Filter.useFilter(students, remove);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        prediction2.setDataset(instNew);
        System.err.print("i got the instance");
        double result = 0;
        try {
            result = model.classifyInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        System.out.printf("the result : %f \n ", result);
        predictValue.setText(Double.toString(result));
        System.out.println("I'm done with Linear Regression");
    }

    //Naive Bayes
    else if (NBB.isSelected()) {
        Classifier cModel = null;

        if (NBmodel != null) {
            cModel = NBmodel;
        } else {
            buildNBClassifier();
            cModel = NBmodel;
        }

        System.out.println("im doing NB");

        //build test 
        Evaluation eTest = null;
        try {
            eTest = new Evaluation(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("Using NB");

        try {
            eTest.evaluateModel(cModel, students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        //display the test results to console 
        String strSummary = eTest.toSummaryString();
        System.out.println(strSummary);

        //build instance to predict 
        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(false);

        prediction2.setDataset(students);
        System.err.print("i got the instance");

        //replace with loop stating the class names 
        //fit text based on name of categories 
        double pred = 0;
        try {
            pred = cModel.classifyInstance(prediction2);
            prediction2.setClassValue(pred);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        //get the predicted value and set predictValue to it 
        predictValue.setText(prediction2.classAttribute().value((int) pred));

        System.out.println("I'm done with Naive Bayes");

        double[] fDistribution2 = null;
        try {
            fDistribution2 = cModel.distributionForInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        double max = 0;
        int maxindex = 0;
        max = fDistribution2[0];
        for (int i = 0; i < fDistribution2.length; i++) {
            if (fDistribution2[i] > max) {
                maxindex = i;
                max = fDistribution2[i];
            }
            System.out.println("the value at " + i + " : " + fDistribution2[i]);
            System.out.println("the label at " + i + prediction2.classAttribute().value(i));
        }
        prediction2.setClassValue(maxindex);
        predictValue.setText(prediction2.classAttribute().value(maxindex));

    }
    //J48 Tree
    else if (JB.isSelected()) {

        System.out.println("im doing j48 ");

        Classifier jModel = null;
        if (Jmodel != null) {
            jModel = Jmodel;
        } else {
            buildJClassifier();
            jModel = Jmodel;
        }
        //test model 
        Evaluation eTest2 = null;
        try {
            eTest2 = new Evaluation(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("Using J48 test");
        try {
            eTest2.evaluateModel(jModel, students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        String strSummary2 = eTest2.toSummaryString();
        System.out.println(strSummary2);

        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(false);

        prediction2.setDataset(students);
        System.err.print("i got the instance\n");

        double pred = 0;
        try {
            pred = jModel.classifyInstance(prediction2);
            prediction2.setClassValue(pred);
            System.out.println("i did a prediction");
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        //get the predicted value and set predictValue to it 
        System.out.println("this was pred:" + pred);
        predictValue.setText(prediction2.classAttribute().value((int) pred));

        System.out.println("I'm done with J48");
        //replace with loop stating the class names 
        //fit text based on name of categories 

        double[] fDistribution2 = null;
        try {
            fDistribution2 = jModel.distributionForInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        double max = 0;
        int maxindex = 0;
        max = fDistribution2[0];
        for (int i = 0; i < fDistribution2.length; i++) {
            if (fDistribution2[i] > max) {
                maxindex = i;
                max = fDistribution2[i];
            }
            System.out.println("the value at " + i + " : " + fDistribution2[i]);
            System.out.println("the label at " + i + " " + prediction2.classAttribute().value(i));
        }
        prediction2.setClassValue(maxindex);
        predictValue.setText(prediction2.classAttribute().value(maxindex));

    }

}

From source file:ia02classificacao.IA02Classificacao.java

/**
 * Loads {@code data/zoo.arff}, builds a J48 tree with the {@code -U} option, classifies one
 * hand-built animal and prints a 10-fold cross-validation summary and confusion matrix.
 *
 * @param args the command line arguments (not used)
 * @throws Exception propagated from any of the Weka calls
 */
public static void main(String[] args) throws Exception {

    // open the ARFF database and show the number of instances (rows)
    DataSource source = new DataSource("data/zoo.arff");
    Instances zoo = source.getDataSet();
    System.out.println("Instancias lidas: " + zoo.numInstances());

    // FILTER: remove the animal-name attribute from the classification
    Remove dropName = new Remove();
    dropName.setOptions(new String[] { "-R", "1" });
    dropName.setInputFormat(zoo);
    zoo = Filter.useFilter(zoo, dropName);

    // rank the attributes by information gain and print the ranking
    // NOTE(review): nothing in this method sets the class index explicitly; presumably one of
    // the Weka calls does it as a side effect, so keep the statement order as is - confirm
    AttributeSelection selector = new AttributeSelection();
    selector.setEvaluator(new InfoGainAttributeEval());
    selector.setSearch(new Ranker());
    selector.SelectAttributes(zoo);
    int[] ranked = selector.selectedAttributes();
    System.out.println("Selected attributes: " + Utils.arrayToString(ranked));

    // use the J48 algorithm and show the classification of the data in textual form
    J48 tree = new J48();
    tree.setOptions(new String[] { "-U" });
    tree.buildClassifier(zoo);
    System.out.println(tree);

    // A graphical view of the tree (TreeVisualizer in a JFrame) was sketched here in
    // commented-out form and is intentionally not executed.

    /*
     * Classification of new data
     */
    System.out.println("\n\nCLASSIFICAO DE NOVOS DADOS");
    // attribute values of the new animal, in dataset order
    double[] knownValues = {
        1.0, // hair
        0.0, // feathers
        0.0, // eggs
        1.0, // milk
        1.0, // airborne
        0.0, // aquatic
        0.0, // predator
        1.0, // toothed
        1.0, // backbone
        1.0, // breathes
        0.0, // venomous
        0.0, // fins
        4.0, // legs
        1.0, // tail
        1.0, // domestic
        1.0  // catsize
    };
    // the array is sized from the dataset; any trailing entries keep their default 0.0
    double[] vals = new double[zoo.numAttributes()];
    for (int i = 0; i < knownValues.length; i++) {
        vals[i] = knownValues[i];
    }

    // create an instance from these values and give it a reference to the dataset
    Instance unicorn = new DenseInstance(1.0, vals);
    unicorn.setDataset(zoo);

    // classify the new instance and print the result
    double predicted = tree.classifyInstance(unicorn);
    System.out.println("Novo Animal: Unicrnio");
    System.out.println("classificacao: " + zoo.classAttribute().value((int) predicted));

    /*
     * Evaluation and error metrics
     */
    System.out.println("\n\nAVALIAO E PREDIO DE ERROS DE MTRICA");
    Classifier freshTree = new J48();
    Evaluation evaluation = new Evaluation(zoo);
    evaluation.crossValidateModel(freshTree, zoo, 10, new Random(1), new Object[] {});
    System.out.println(evaluation.toSummaryString());

    /*
     * Confusion matrix
     */
    System.out.println("\n\nMATRIZ DE CONFUSO");
    double[][] matrix = evaluation.confusionMatrix();
    System.out.println(evaluation.toMatrixString());

}

From source file:ia03classificador.jFrClassificador.java

/**
 * Reads the animal description from the form, builds a J48 tree from {@code data/zoo.arff}
 * and shows the predicted class in {@code lblClassification}.
 *
 * @throws Exception propagated from the Weka calls or from parsing the legs field
 */
public void doClassificate() throws Exception {

    // a selected button yields 1, an unselected one yields 0
    v00 = btn00.isSelected() ? 1.0 : 0.0;
    v01 = btn01.isSelected() ? 1.0 : 0.0;
    v02 = btn02.isSelected() ? 1.0 : 0.0;
    v03 = btn03.isSelected() ? 1.0 : 0.0;
    v04 = btn04.isSelected() ? 1.0 : 0.0;
    v05 = btn05.isSelected() ? 1.0 : 0.0;
    v06 = btn06.isSelected() ? 1.0 : 0.0;
    v07 = btn07.isSelected() ? 1.0 : 0.0;
    v08 = btn08.isSelected() ? 1.0 : 0.0;
    v09 = btn09.isSelected() ? 1.0 : 0.0;
    v10 = btn10.isSelected() ? 1.0 : 0.0;
    v11 = btn11.isSelected() ? 1.0 : 0.0;
    v13 = btn13.isSelected() ? 1.0 : 0.0;
    v14 = btn14.isSelected() ? 1.0 : 0.0;
    v15 = btn15.isSelected() ? 1.0 : 0.0;

    // the legs field falls back to "2" when it is missing or blank
    legs = txtLegs.getText();
    if (legs == null || legs.trim().isEmpty()) {
        legs = "2";
    }
    name = txtName.getText();

    // open the ARFF database and keep its records in a dataset object
    ConverterUtils.DataSource source = new ConverterUtils.DataSource("data/zoo.arff");
    Instances zoo = source.getDataSet();

    // FILTER: remove the animal-name attribute from the classification
    Remove dropName = new Remove();
    dropName.setOptions(new String[] { "-R", "1" });
    dropName.setInputFormat(zoo);
    zoo = Filter.useFilter(zoo, dropName);

    // NOTE(review): the ranking result is not used below, but nothing else in this method sets
    // the class index; presumably this call does it as a side effect - confirm before removing
    AttributeSelection selector = new AttributeSelection();
    selector.setEvaluator(new InfoGainAttributeEval());
    selector.setSearch(new Ranker());
    selector.SelectAttributes(zoo);
    int[] ranked = selector.selectedAttributes();

    // use the J48 algorithm to build the decision tree
    J48 tree = new J48();
    tree.setOptions(new String[] { "-U" });
    tree.buildClassifier(zoo);

    // values of the new element to classify, in dataset order
    double[] vals = new double[zoo.numAttributes()];
    vals[0] = v00; // hair
    vals[1] = v01; // feathers
    vals[2] = v02; // eggs
    vals[3] = v03; // milk
    vals[4] = v04; // airborne
    vals[5] = v05; // aquatic
    vals[6] = v06; // predator
    vals[7] = v07; // toothed
    vals[8] = v08; // backbone
    vals[9] = v09; // breathes
    vals[10] = v10; // venomous
    vals[11] = v11; // fins
    vals[12] = Double.parseDouble(legs); // legs
    vals[13] = v13; // tail
    vals[14] = v14; // domestic
    vals[15] = v15; // catsize

    // create an instance from these values and give it a reference to the dataset
    Instance candidate = new DenseInstance(1.0, vals);
    candidate.setDataset(zoo);

    // classify the new instance and display the resulting class label
    double predicted = tree.classifyInstance(candidate);
    lblClassification.setText(zoo.classAttribute().value((int) predicted));

}

From source file:intensityclustering.IntensityClustering.java

/**
 * Draws the 2D Histogram Plot in the IntensityClustering. X-Axsis is
 * intensity value of chanel 2 image (where the stained nuclei are). Y-axis
 * are relative frequencies of present nuclei.
 *
 * @param tss The TMAspots whose nuclei are considered (both gold-standard
 * and estimated nuclei)./*from  w w w .  j a  va 2s . c  om*/
 * @param doAlsoClustering If true, the TMApoints are also clustered
 * according to the histogram.
 */
void drawNucleiIntensities2D(List<TMAspot> tss, boolean doAlsoClustering) {
    // draw the plot
    Plot2DPanel plot;
    if (((java.awt.BorderLayout) (jPanel9.getLayout()))
            .getLayoutComponent(java.awt.BorderLayout.CENTER) != null) {
        plot = (Plot2DPanel) ((java.awt.BorderLayout) (jPanel9.getLayout()))
                .getLayoutComponent(java.awt.BorderLayout.CENTER);
        plot.removeAllPlots();
        plot.removeAllPlotables();
    } else {
        plot = new Plot2DPanel(PlotPanel.SOUTH);
        plot.setAxisLabels("Intensity", "Frequency");
        plot.plotCanvas.setBackground(jPanel9.getBackground());
        plot.plotLegend.setBackground(jPanel9.getBackground());
        plot.plotToolBar.setBackground(plot.plotCanvas.getBackground());
    }
    if (((java.awt.BorderLayout) (jPanel9.getLayout()))
            .getLayoutComponent(java.awt.BorderLayout.CENTER) == null) {
        jPanel9.add(plot, java.awt.BorderLayout.CENTER);
        jPanel15.setBackground(plot.plotCanvas.getBackground());
        jPanel15.setVisible(true);
        validate();
        pack();
    }

    if (tss.size() > 0) {
        try {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

            List<Integer> intensities = new ArrayList<>();
            int intensity;
            int min = Integer.parseInt(jTextField1.getText());
            int max = Integer.parseInt(jTextField16.getText());
            for (TMAspot ts : tss) {
                //TODO: GET THE CHANNEL 2 Image
                //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false);
                BufferedImage img = ts.getBufferedImage(false);
                // img can be null if color deconvolution has not been performed, yet.
                if (img != null) {
                    List<TMApoint> tps = ts.getPoints();
                    for (TMALabel tp : tps) {
                        intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false)
                                .getRed();
                        if (intensity >= min && intensity <= max) {
                            intensities.add(intensity);
                        }
                    }
                }
            }

            double[] intensities_array = new double[intensities.size()];

            for (int i = 0; i < intensities.size(); i++) {
                intensities_array[i] = intensities.get(i);
            }

            int nbins = jSlider7.getValue();
            if (intensities_array.length > 0) {
                plot.addHistogramPlot("TMA points", intensities_array, 0, 256, nbins);
            } //else {
              //  JOptionPane.showMessageDialog(this, "No TMA points have been found.", "No TMA points found.", JOptionPane.WARNING_MESSAGE);
              //}

            //// Cluster Points according to histograms
            if (doAlsoClustering) {
                // Find Clusters
                int n = getParam_nClusters();

                // Create ARFF Data
                FastVector atts;
                Instances data;
                int i;

                // 1. create arff data format
                atts = new FastVector(1);
                for (i = 0; i < 1; i++) {
                    atts.addElement(new Attribute(Integer.toString(i)));
                }

                // 2. create Instances object
                data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss));

                // 3. fill with data
                for (i = 0; i < intensities_array.length; i++) {
                    // add the instance
                    Instance inst = new Instance(1.0, new double[] { intensities_array[i] });
                    inst.setDataset(data);
                    data.add(inst);
                }

                // 4. set data class index (last attribute is the class)
                //data.setClassIndex(data.numAttributes() - 1); // not for weka 3.5.X
                if (tmarker.DEBUG > 4) {
                    java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                            data.toString());
                }

                Clusterer clusterer = getClusterer();
                String[] options = getClustererOptions();

                if (tmarker.DEBUG > 3) {
                    if (options.length > 0) {
                        String info = "Clusterer should have options:\n";
                        for (String o : options) {
                            info += o + " ";
                        }
                        info += "\n";
                        java.util.logging.Logger.getLogger(getClass().getName())
                                .log(java.util.logging.Level.INFO, info);
                    }
                }

                clusterer.setOptions(options); // set the clusterer options
                clusterer.buildClusterer(data); // build the clusterer

                // order the clusters according to the brightness
                // The most bright cluster should be 0, then 1, then 2,...
                ArrayList<ArrayList<Double>> values = new ArrayList<>();
                for (i = 0; i < n; i++) {
                    values.add(new ArrayList<Double>());
                }
                int z;
                double value;
                for (i = 0; i < data.numInstances(); i++) {
                    z = clusterer.clusterInstance(data.instance(i));
                    value = data.instance(i).value(0);
                    values.get(z).add(value);
                }
                double[] means = new double[n];
                double[] stds = new double[n];
                for (i = 0; i < n; i++) {
                    means[i] = Misc.mean(values.get(i).toArray(new Double[values.get(i).size()]));
                    stds[i] = Misc.std(values.get(i).toArray(new Double[values.get(i).size()]));
                }
                int[] ordering = Misc.orderArray(means, true);

                for (i = 0; i < n; i++) {
                    int ind = Misc.IndexOf(ordering, i);
                    plot.addPlotable(new Line(getParam_ColorOfClassK(i),
                            new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] },
                            new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] }, 2 * stds[ind]));
                    plot.addPlot(Plot2DPanel.LINE, "Staining " + i, getParam_ColorOfClassK(i),
                            new double[][] { new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] },
                                    new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] } });
                }

                String clusterInfo = "";
                for (String o : clusterer.getOptions()) {
                    clusterInfo += o + " ";
                }
                clusterInfo += "\n\n";
                clusterInfo += clusterer.toString().trim();
                if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                    try {
                        clusterInfo += ((HierarchicalClusterer) clusterer).graph();
                        HierarchyVisualizer a = new HierarchyVisualizer(
                                ((HierarchicalClusterer) clusterer).graph());
                        a.setSize(800, 600);
                        if (clusterVisualizer == null) {
                            clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram");
                            clusterVisualizer.setIconImage(getIconImage());
                            clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
                            clusterVisualizer.setSize(800, 600);
                        }
                        Container contentPane = clusterVisualizer.getContentPane();
                        contentPane.removeAll();
                        contentPane.add(a);
                    } catch (Exception e) {
                        clusterVisualizer = null;
                    }
                }
                jTextArea1.setText(clusterInfo);

                if (tmarker.DEBUG > 3) {
                    String info = "Clusterer has options\n";
                    for (String o : clusterer.getOptions()) {
                        info += o + " ";
                    }
                    info += "\n";
                    info += clusterer.toString() + "\n";
                    // info += (clusterer).globalInfo() + "\n";
                    info += "\n";
                    info += clusterInfo + "\n";
                    java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                            info);
                }

                // cluster all TMAspots and assign the corresponding class to them
                // Cluster the points
                List<List<Integer>> clustered_points = new ArrayList<>();
                for (i = 0; i < n; i++) {
                    clustered_points.add(new ArrayList<Integer>());
                }

                int k;
                for (TMAspot ts : tss) {
                    //TODO: GET THE CHANNEL 2 IMAGE
                    //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false);
                    BufferedImage img = ts.getBufferedImage(false);
                    List<TMApoint> tps = ts.getPoints();
                    for (TMApoint tp : tps) {
                        intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false)
                                .getRed();

                        // add the instance
                        Instance inst = new Instance(1.0, new double[] { intensity });
                        inst.setDataset(data);
                        k = ordering[clusterer.clusterInstance(inst)];

                        // store the color for later visualization
                        clustered_points.get(k).add(intensity);

                        // set the staining of the TMApoint
                        switch (k) {
                        case 0:
                            tp.setStaining(TMALabel.STAINING_0);
                            break;
                        case 1:
                            tp.setStaining(TMALabel.STAINING_1);
                            break;
                        case 2:
                            tp.setStaining(TMALabel.STAINING_2);
                            break;
                        default:
                            tp.setStaining(TMALabel.STAINING_3);
                            break;
                        }
                    }
                    ts.dispStainingInfo();
                    if (manager.getVisibleTMAspot() == ts) {
                        manager.repaintVisibleTMAspot();
                    }
                }

                // Write the description
                String description = "Nuclei clustered with " + getParam_AutomaticClustererString();
                if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                    description += " (" + getParam_HierarchicalClusteringMethod() + ")";
                }
                description += ", n=" + getParam_nClusters() + ", channel 2 intensity.";
                jLabel42.setText(description);
                jLabel41.setText(" ");

            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));
        }
    }
}

From source file:intensityclustering.IntensityClustering.java

/**
 * Clusters the TMApoints on given TMAspots according to their staining
 * intensity (color). All parameters (e.g. clusterer and parameters) are
 * selected by the user. Features are simple color features.
 *
 * @param tss The TMAspots of which all nuclei (gold-standard and estimated)
 * are clustered according to color./* w w  w  .  j ava 2  s.c  om*/
 */
private void clusterPointsAutomaticallyColorSpace(List<TMAspot> tss) {
    // Purpose: build a three-feature Weka dataset from the average color at
    // every TMApoint of the given TMAspots, cluster it with the user-selected
    // clusterer, assign a staining level to each point according to its
    // (re-ordered) cluster, and show the result in the 3D scatter plot.
    //
    // Failures (including OutOfMemoryError) are logged and reported in a
    // dialog; the default cursor is always restored.

    // Nothing to cluster without at least one TMAspot.
    if (tss.isEmpty()) {
        return;
    }
    try {
        this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

        // 1. create arff data format: three attributes named "0", "1", "2"
        FastVector atts = new FastVector(3);
        for (int a = 0; a < 3; a++) {
            atts.addElement(new Attribute(Integer.toString(a)));
        }

        // 2. create Instances object
        Instances data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss));

        // 3. fill with data: one instance per TMApoint, holding the three
        // color features of the average color around the point
        float[] features = new float[3]; // scratch buffer, overwritten by every Color2Feature call
        String colorSpace = getParam_ColorSpace();
        for (TMAspot ts : tss) {
            BufferedImage img = ts.getBufferedImage();
            List<TMApoint> tps = ts.getPoints();
            for (TMApoint tp : tps) {
                Color2Feature(TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false),
                        colorSpace, features);

                // add the instance
                Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] });
                inst.setDataset(data);
                data.add(inst);
            }
        }

        // No class index is set: the data is clustered, not classified.
        if (tmarker.DEBUG > 4) {
            java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                    data.toString());
        }

        Clusterer clusterer = getClusterer();
        String[] options = getClustererOptions();

        if (tmarker.DEBUG > 3 && options.length > 0) {
            StringBuilder info = new StringBuilder("Clusterer should have options\n");
            for (String o : options) {
                info.append(o).append(" ");
            }
            info.append("\n");
            java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                    info.toString());
        }

        clusterer.setOptions(options); // set the clusterer options
        clusterer.buildClusterer(data); // build the clusterer
        final int numClusters = clusterer.numberOfClusters();

        // order the clusters according to the brightness
        // The most bright cluster should be 0, then 1, then 2,...
        List<List<Double>> brightnessByCluster = new ArrayList<>();
        for (int i = 0; i < numClusters; i++) {
            brightnessByCluster.add(new ArrayList<Double>());
        }
        // For HSB the third feature is used as brightness directly; otherwise
        // the three features are converted to a gray value. The decision is
        // based on the same color space the features were computed in.
        final boolean isHsb = colorSpace.equalsIgnoreCase("hsb");
        for (int i = 0; i < data.numInstances(); i++) {
            Instance inst = data.instance(i);
            int cluster = clusterer.clusterInstance(inst);
            double brightness = isHsb ? inst.value(2)
                    : Misc.RGBToGray(inst.value(0), inst.value(1), inst.value(2));
            brightnessByCluster.get(cluster).add(brightness);
        }
        double[] means = new double[numClusters];
        for (int i = 0; i < numClusters; i++) {
            List<Double> clusterValues = brightnessByCluster.get(i);
            means[i] = Misc.mean(clusterValues.toArray(new Double[clusterValues.size()]));
        }
        // ordering[rawCluster] is the staining class used below.
        // NOTE(review): the sort direction is flipped for "rtp"; presumably the
        // flag selects descending order - confirm against Misc.orderArray.
        int[] ordering = Misc.orderArray(means, !colorSpace.equalsIgnoreCase("rtp"));

        // Textual summary of the clusterer for the info text area
        StringBuilder clusterInfo = new StringBuilder();
        for (String o : clusterer.getOptions()) {
            clusterInfo.append(o).append(" ");
        }
        clusterInfo.append("\n\n");
        clusterInfo.append(clusterer.toString().trim());
        if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
            try {
                clusterInfo.append(((HierarchicalClusterer) clusterer).graph());
                HierarchyVisualizer dendrogram = new HierarchyVisualizer(
                        ((HierarchicalClusterer) clusterer).graph());
                dendrogram.setSize(800, 600);
                if (clusterVisualizer == null) {
                    clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram");
                    clusterVisualizer.setIconImage(getIconImage());
                    clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
                    clusterVisualizer.setSize(800, 600);
                }
                Container contentPane = clusterVisualizer.getContentPane();
                contentPane.removeAll();
                contentPane.add(dendrogram);
            } catch (Exception e) {
                // The dendrogram is optional: drop the frame and carry on, but
                // leave a trace of why it is missing.
                java.util.logging.Logger.getLogger(getClass().getName()).log(
                        java.util.logging.Level.WARNING,
                        "Could not create the hierarchical clusterer dendrogram.", e);
                clusterVisualizer = null;
            }
        }
        jTextArea1.setText(clusterInfo.toString());

        if (tmarker.DEBUG > 3) {
            StringBuilder info = new StringBuilder("Clusterer has options\n");
            for (String o : clusterer.getOptions()) {
                info.append(o).append(" ");
            }
            info.append("\n");
            info.append(clusterer.toString()).append("\n");
            info.append("\n");
            info.append(clusterInfo).append("\n");
            java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                    info.toString());
        }

        // cluster all TMAspots and assign the corresponding class to them
        List<List<Color>> clustered_points = new ArrayList<>();
        for (int i = 0; i < numClusters; i++) {
            clustered_points.add(new ArrayList<Color>());
        }

        for (TMAspot ts : tss) {
            BufferedImage img = ts.getBufferedImage();
            List<TMApoint> tps = ts.getPoints();
            for (TMApoint tp : tps) {
                Color pointColor = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false);
                Color2Feature(pointColor, colorSpace, features);

                // classify the point with the trained clusterer
                Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] });
                inst.setDataset(data);
                int k = ordering[clusterer.clusterInstance(inst)];

                // store the color for later visualization
                clustered_points.get(k).add(pointColor);

                // set the staining of the TMApoint; every class above 2 maps to STAINING_3
                switch (k) {
                case 0:
                    tp.setStaining(TMALabel.STAINING_0);
                    break;
                case 1:
                    tp.setStaining(TMALabel.STAINING_1);
                    break;
                case 2:
                    tp.setStaining(TMALabel.STAINING_2);
                    break;
                default:
                    tp.setStaining(TMALabel.STAINING_3);
                    break;
                }
            }
            ts.dispStainingInfo();
            if (manager.getVisibleTMAspot() == ts) {
                manager.repaintVisibleTMAspot();
            }
        }

        // draw the points: reuse the plot already sitting in the panel's
        // center, or create a new one that is added at the end
        java.awt.Component existingPlot = ((java.awt.BorderLayout) (jPanel2.getLayout()))
                .getLayoutComponent(java.awt.BorderLayout.CENTER);
        Plot3DPanel plot;
        if (existingPlot != null) {
            plot = (Plot3DPanel) existingPlot;
            plot.removeAllPlots();
        } else {
            plot = new Plot3DPanel();
            plot.plotCanvas.setBackground(jPanel2.getBackground());
            plot.addLegend(PlotPanel.SOUTH);
            plot.plotLegend.setBackground(jPanel2.getBackground());
        }
        if (colorSpace.equalsIgnoreCase("hsb")) {
            plot.setAxisLabels("Hue", "Saturation", "Brightness");
        } else if (colorSpace.equalsIgnoreCase("rtp")) {
            plot.setAxisLabels("R", "Theta", "Phi");
        } else {
            plot.setAxisLabels("Red", "Green", "Blue");
        }

        // one scatter plot per non-empty staining class
        for (int i = 0; i < numClusters; i++) {
            List<Color> colors = clustered_points.get(i);
            double[] xs = new double[colors.size()];
            double[] ys = new double[colors.size()];
            double[] zs = new double[colors.size()];
            for (int j = 0; j < colors.size(); j++) {
                Color2Feature(colors.get(j), colorSpace, features);
                xs[j] = features[0];
                ys[j] = features[1];
                zs[j] = features[2];
            }
            if (xs.length > 0) {
                Color classColor = getParam_ColorOfClassK(i);
                plot.addScatterPlot("Staining " + i, classColor, xs, ys, zs);
            }
        }

        // Write the description
        String description = "Nuclei clustered with " + getParam_AutomaticClustererString();
        if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
            description += " (" + getParam_HierarchicalClusteringMethod() + ")";
        }
        description += ", n=" + getParam_nClusters() + ", color space " + colorSpace + ".";
        jLabel41.setText(description);
        jLabel42.setText(" ");

        // A freshly created plot still has to be put into the panel.
        if (existingPlot == null) {
            jPanel2.add(plot, java.awt.BorderLayout.CENTER);
            validate();
            pack();
        }
    } catch (Exception | OutOfMemoryError e) {
        java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.SEVERE, null,
                e);
        JOptionPane.showMessageDialog(this,
                "The clustering could not be performed.\n\n" + "A possible reasons is:\n"
                        + "- Not enough memory (too many points), \n\n"
                        + "You might want to try a different clustering method or less TMAspots.\n\n"
                        + "The error message is: \n" + e.getMessage(),
                "Error at Nucleus clustering", JOptionPane.WARNING_MESSAGE);
    } finally {
        this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));
    }
}

From source file:jwebminer2.FeatureValueFileSaver.java

/**
 * Save the given text to the given location in the given format or
 * save the stored feature values, depending on the chosen_file_extension.
 * A progress bar is displayed (although not incremented).
 *
 * @param chosen_file_extension The file extension (corresponding to one
 *                              of the extensions published by the
 *                              getFileFormatExtension method) to use when
 *                              saving data_to_save, and the corresponding
 *                              file format.
 * @param data_to_save          The HTML code displayed on-screen. May be
 *                              null for non-HTML saving.
 * @param save_location         The file to save data_to_save to.
 * @throws Exception            Throws an Exception if the file cannot be
 *                              saved.
 */
public void saveContents(String chosen_file_extension, String data_to_save, File save_location)
        throws Exception {
    // Prepare the progress bar
    SimpleProgressBarDialog progress_bar = new SimpleProgressBarDialog(1, results_panel);

    try {
        // Write the whole contents of data_to_save verbatim as an HTML file
        // if an HTML file is to be saved
        if (chosen_file_extension.equals("HTML")) {
            // try-with-resources closes the stream even when the write fails
            try (DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location)) {
                writer.writeBytes(data_to_save);
            }
        }

        // Only save the table of final feature values itself if a non-HTML
        // file format is to be saved
        else {
            // Access information to store
            double[][] feature_table = results_panel.feature_values;
            String[] row_labels = results_panel.row_labels;
            String[] orig_column_labels = results_panel.column_labels;
            String[] column_labels = orig_column_labels;

            // In this configuration every original column label is written
            // twice: first all "<label>_WS" columns, then all "<label>_LastFM"
            // columns.
            if (AnalysisProcessor.lastfm_enabled && AnalysisProcessor.is_cross_tabulation
                    && (AnalysisProcessor.yahoo_application_id != null
                            || AnalysisProcessor.google_license_key != null)) {
                String[] column_labels_lastfm_websearch = new String[2 * orig_column_labels.length];
                for (int i = 0; i < orig_column_labels.length; i++) {
                    column_labels_lastfm_websearch[i] = orig_column_labels[i] + "_WS";
                    column_labels_lastfm_websearch[i + orig_column_labels.length] = orig_column_labels[i]
                            + "_LastFM";
                }
                column_labels = column_labels_lastfm_websearch;
            }

            // Save as tab delimited text file
            if (chosen_file_extension.equals("TXT")) {
                // Calculate the table to save: row 0 holds the column
                // labels, column 0 holds the row labels, the top-left cell
                // stays empty
                String[][] results_table = new String[row_labels.length + 1][column_labels.length + 1];
                results_table[0][0] = "";
                for (int j = 1; j < results_table[0].length; j++) {
                    results_table[0][j] = column_labels[j - 1];
                }
                for (int i = 1; i < results_table.length; i++) {
                    results_table[i][0] = row_labels[i - 1];
                    for (int j = 1; j < results_table[i].length; j++) {
                        results_table[i][j] = String.valueOf(feature_table[i - 1][j - 1]);
                    }
                }

                // Save the table; the stream is closed even if a write fails
                try (DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                        .getDataOutputStream(save_location)) {
                    for (int i = 0; i < results_table.length; i++) {
                        for (int j = 0; j < results_table[i].length; j++) {
                            // Write the table entry
                            writer.writeBytes(results_table[i][j]);

                            // Add a tab or a line break
                            if (j == results_table[i].length - 1) {
                                writer.writeBytes("\n");
                            } else {
                                writer.writeBytes("\t");
                            }
                        }
                    }
                }
            }

            // Save as ACE XML file
            else if (chosen_file_extension.equals("ACE XML")) {
                // Prepare feature definitions and store feature names to
                // put in DataSets
                FeatureDefinition[] feature_definitions = new FeatureDefinition[column_labels.length];
                String[] feature_names = new String[column_labels.length];
                for (int feat = 0; feat < feature_definitions.length; feat++) {
                    feature_definitions[feat] = new FeatureDefinition(column_labels[feat], "", false, 1);
                    feature_names[feat] = column_labels[feat];
                }

                // Prepare the DataSets to write, one per row
                DataSet[] data_sets = new DataSet[row_labels.length];
                for (int instance = 0; instance < data_sets.length; instance++) {
                    // Instantiate the DataSet
                    data_sets[instance] = new DataSet();

                    // Store the instance names
                    data_sets[instance].identifier = row_labels[instance];

                    // Store the names of the features
                    data_sets[instance].feature_names = feature_names;

                    // Store the features for this DataSet; each feature is
                    // wrapped in a one-element array
                    double[][] these_feature_values = new double[feature_table[instance].length][1];
                    for (int feat = 0; feat < these_feature_values.length; feat++) {
                        these_feature_values[feat][0] = feature_table[instance][feat];
                    }
                    data_sets[instance].feature_values = these_feature_values;

                    // Validate, order and compact the DataSet
                    data_sets[instance].orderAndCompactFeatures(feature_definitions, true);
                }

                // Save the feature values
                DataSet.saveDataSets(data_sets, feature_definitions, save_location,
                        "Features extracted with jWebMiner 2.0");
            }

            // Save as Weka ARFF file
            else if (chosen_file_extension.equals("Weka ARFF")) {
                // Set the name of the dataset to the name of the file
                // that is to be saved
                String data_set_name = mckay.utilities.staticlibraries.StringMethods
                        .removeExtension(save_location.getName());

                // Set the Attributes (feature names and class names)
                FastVector attributes_vector = new FastVector(column_labels.length + 1); // extra 1 is for class name
                for (int feat = 0; feat < column_labels.length; feat++) {
                    attributes_vector.addElement(new Attribute(column_labels[feat]));
                }
                FastVector class_names_vector = new FastVector(column_labels.length);
                for (int cat = 0; cat < orig_column_labels.length; cat++) {
                    class_names_vector.addElement(orig_column_labels[cat]);
                }
                attributes_vector.addElement(new Attribute("Class", class_names_vector));

                // Store attributes in an Instances object
                Instances instances = new Instances(data_set_name, attributes_vector, row_labels.length);
                instances.setClassIndex(instances.numAttributes() - 1);

                // Store the feature values; the class value of every
                // instance is never set here
                for (int inst = 0; inst < row_labels.length; inst++) {
                    // Initialize an instance
                    Instance this_instance = new Instance(instances.numAttributes());
                    this_instance.setDataset(instances);

                    // Set feature values for the instance
                    for (int feat = 0; feat < column_labels.length; feat++) {
                        this_instance.setValue(feat, feature_table[inst][feat]);
                    }

                    // NOTE(review): this replaces the file-derived relation
                    // name as soon as there is at least one row. Kept inside
                    // the loop so an empty table still keeps data_set_name -
                    // confirm which name is intended.
                    instances.setRelationName("jWebMiner2");

                    // Add this instance to instances
                    instances.add(this_instance);
                }

                // Prepare the buffer to save to and add comments indicating
                // the names of the rows. The stream is closed even if
                // writing the header or the batch fails.
                try (DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                        .getDataOutputStream(save_location)) {
                    writer.writeBytes("% INSTANCES (DATA ROWS) BELOW CORRESPOND TO:\n%\n");
                    for (int inst = 0; inst < row_labels.length; inst++) {
                        writer.writeBytes("%    " + (inst + 1) + ") " + row_labels[inst] + "\n");
                    }
                    writer.writeBytes("%\n");

                    // Save the ARFF file
                    ArffSaver arff_saver = new ArffSaver();
                    arff_saver.setInstances(instances);
                    arff_saver.setFile(save_location);
                    arff_saver.setDestination(writer);
                    try {
                        arff_saver.writeBatch();
                    } catch (Exception e) {
                        // Same message as before, now with the underlying cause attached
                        throw new Exception(
                                "File only partially saved.\n\nTry resaving the file with a .arff extension.",
                                e);
                    }
                }
            }
        }
    } finally {
        // Terminate the progress bar, also when saving failed
        progress_bar.done();
    }
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance.
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {
    // Purpose: expand one document instance into one output instance per
    // candidate phrase (plus one dummy instance per entry of hashKeysEval),
    // score each phrase with m_Classifier, and store a rank in the attribute
    // following the probability. The ranked instances are returned in a
    // FastVector, ordered by descending probability.
    //
    // instance - the document instance; its m_DocumentAtt string is parsed
    //            for phrases and, if present, m_KeyphrasesAtt supplies the
    //            given keyphrases
    // training - only forwarded to featVals

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes: the document attribute is replaced
    // by phrase, original version, TFxIDF, distance, probability and rank
    // (6 slots for 1, i.e. 5 extra), plus the keyphrase frequency when both
    // m_Debug and m_KFused are set
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes (positions in the output instances)
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    // NOTE(review): every key still in hashKeysEval gets a dummy instance;
    // presumably featVals removes the keyphrases it matched - confirm.
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank; the lowest possible
                    // probability puts these instances behind all real
                    // phrases in the probability sort below
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Three successive stable sorts follow. Because each one keeps the
    // relative order of equal keys, the final order is by probability, with
    // TFxIDF and then distance acting as tie-breakers.

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort);
    // the key is negated so larger TFxIDF values come first
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort);
    // the key is 1 - probability so larger probabilities come first
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue.
        // The key is recomputed from the instance itself: vals was filled
        // before the vector was reordered above, so vals[i] is not
        // guaranteed to belong to the instance now sitting at position i.
        if (Utils.grOrEq(1 - currentInstance.value(probsAttIndex), 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        // startInd ends up one past the last such phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        // Scan all phrases ordered before (or tied with) the current one
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        // Phrases covered by a superphrase are pushed to the end of the
        // ranking; all others receive consecutive ranks starting at 1
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values.
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:kea.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found./*from  ww w.  j a va 2  s. c  om*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}