Example usage for weka.core Instance setDataset

Introduction

In this page you can find the example usage for weka.core Instance setDataset.

Prototype

public void setDataset(Instances instances);

Source Link

Document

Sets the reference to the dataset.

Usage

From source file:gr.ntua.sentimentanalysis.VectorModelSentimentAnalysis.java

License:Open Source License

@WebMethod(operationName = "getTweetSentiment")
public String getTextSentiment(String document) {
    Instance instance = vmcl.getInstance(-1, REP_MODEL, document);
    instance.setDataset(instances);

    int response = -1;
    try {//w ww . ja v a  2  s  .c o  m
        response = (int) classifier.classifyInstance(instance);
    } catch (Exception e) {
        e.printStackTrace();
    }

    if (response == 0) {
        return "negative";
    } else if (response == 1) {
        return "positive";
    } else {
        return "unknown";
    }
}

From source file:GroupProject.DMChartUI.java

/**
* Action for the generate button/*from  w  w  w.  jav a  2 s .  c  om*/
* It reads the user input from the table and the selected options and performs
* a classifiecation of the user input
* the user can choose linear regression, naive bayes classifier, or j48 trees to classify 
*
*/
private void generateButtonActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_generateButtonActionPerformed
    // TODO add your handling code here:                                              
    // TODO add your handling code here:
    //File file = new File("studentTemp.csv");
    CSVtoArff converter = new CSVtoArff();
    Instances students = null;
    Instances students2 = null;
    try {
        converter.convert("studentTemp.csv", "studentTemp.arff");
    } catch (IOException ex) {
        Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    try {
        students = new Instances(new BufferedReader(new FileReader("studentTemp.arff")));
        students2 = new Instances(new BufferedReader(new FileReader("studentTemp.arff")));
    } catch (IOException ex) {
        Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
    }

    //get column to predict values for 
    //int target=students.numAttributes()-1; 
    int target = dataSelector.getSelectedIndex() + 1;
    System.out.printf("this is the target: %d\n", target);
    //set target 
    students.setClassIndex(target);
    students2.setClassIndex(target);

    //case on which radio button is selected 
    //Linear Regressions
    if (LRB.isSelected()) {

        LinearRegression model = null;
        if (Lmodel != null) {
            model = Lmodel;
        } else {
            buildLinearModel();
            model = Lmodel;
        }

        System.out.println("im doing linear regression");

        equationDisplayArea.setText(model.toString());

        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(true);

        Remove remove = new Remove();
        int[] toremove = { 0, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 };
        remove.setAttributeIndicesArray(toremove);

        try {
            remove.setInputFormat(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        Instances instNew = null;
        try {
            instNew = Filter.useFilter(students, remove);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        prediction2.setDataset(instNew);
        System.err.print("i got the instance");
        double result = 0;
        try {
            result = model.classifyInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        System.out.printf("the result : %f \n ", result);
        predictValue.setText(Double.toString(result));
        System.out.println("I'm done with Linear Regression");
    }

    //Naive Bayes
    else if (NBB.isSelected()) {
        Classifier cModel = null;

        if (NBmodel != null) {
            cModel = NBmodel;
        } else {
            buildNBClassifier();
            cModel = NBmodel;
        }

        System.out.println("im doing NB");

        //build test 
        Evaluation eTest = null;
        try {
            eTest = new Evaluation(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("Using NB");

        try {
            eTest.evaluateModel(cModel, students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        //display the test results to console 
        String strSummary = eTest.toSummaryString();
        System.out.println(strSummary);

        //build instance to predict 
        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(false);

        prediction2.setDataset(students);
        System.err.print("i got the instance");

        //replace with loop stating the class names 
        //fit text based on name of categories 
        double pred = 0;
        try {
            pred = cModel.classifyInstance(prediction2);
            prediction2.setClassValue(pred);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        //get the predicted value and set predictValue to it 
        predictValue.setText(prediction2.classAttribute().value((int) pred));

        System.out.println("I'm done with Naive Bayes");

        double[] fDistribution2 = null;
        try {
            fDistribution2 = cModel.distributionForInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        double max = 0;
        int maxindex = 0;
        max = fDistribution2[0];
        for (int i = 0; i < fDistribution2.length; i++) {
            if (fDistribution2[i] > max) {
                maxindex = i;
                max = fDistribution2[i];
            }
            System.out.println("the value at " + i + " : " + fDistribution2[i]);
            System.out.println("the label at " + i + prediction2.classAttribute().value(i));
        }
        prediction2.setClassValue(maxindex);
        predictValue.setText(prediction2.classAttribute().value(maxindex));

    }
    //J48 Tree
    else if (JB.isSelected()) {

        System.out.println("im doing j48 ");

        Classifier jModel = null;
        if (Jmodel != null) {
            jModel = Jmodel;
        } else {
            buildJClassifier();
            jModel = Jmodel;
        }
        //test model 
        Evaluation eTest2 = null;
        try {
            eTest2 = new Evaluation(students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        System.out.println("Using J48 test");
        try {
            eTest2.evaluateModel(jModel, students);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }
        String strSummary2 = eTest2.toSummaryString();
        System.out.println(strSummary2);

        System.out.println("im going to get the instance");

        Instance prediction2 = getInstance(false);

        prediction2.setDataset(students);
        System.err.print("i got the instance\n");

        double pred = 0;
        try {
            pred = jModel.classifyInstance(prediction2);
            prediction2.setClassValue(pred);
            System.out.println("i did a prediction");
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        //get the predicted value and set predictValue to it 
        System.out.println("this was pred:" + pred);
        predictValue.setText(prediction2.classAttribute().value((int) pred));

        System.out.println("I'm done with J48");
        //replace with loop stating the class names 
        //fit text based on name of categories 

        double[] fDistribution2 = null;
        try {
            fDistribution2 = jModel.distributionForInstance(prediction2);
        } catch (Exception ex) {
            Logger.getLogger(DMChartUI.class.getName()).log(Level.SEVERE, null, ex);
        }

        double max = 0;
        int maxindex = 0;
        max = fDistribution2[0];
        for (int i = 0; i < fDistribution2.length; i++) {
            if (fDistribution2[i] > max) {
                maxindex = i;
                max = fDistribution2[i];
            }
            System.out.println("the value at " + i + " : " + fDistribution2[i]);
            System.out.println("the label at " + i + " " + prediction2.classAttribute().value(i));
        }
        prediction2.setClassValue(maxindex);
        predictValue.setText(prediction2.classAttribute().value(maxindex));

    }

}

From source file:ia02classificacao.IA02Classificacao.java

/**
 * @param args the command line arguments
 *///from   w w  w  .  j a v a 2 s. c om
public static void main(String[] args) throws Exception {

    // abre o banco de dados arff e mostra a quantidade de instancias (linhas)
    DataSource arquivo = new DataSource("data/zoo.arff");
    Instances dados = arquivo.getDataSet();
    System.out.println("Instancias lidas: " + dados.numInstances());

    // FILTER: remove o atributo nome do animal da classificao
    String[] parametros = new String[] { "-R", "1" };
    Remove filtro = new Remove();
    filtro.setOptions(parametros);
    filtro.setInputFormat(dados);
    dados = Filter.useFilter(dados, filtro);

    AttributeSelection selAtributo = new AttributeSelection();
    InfoGainAttributeEval avaliador = new InfoGainAttributeEval();
    Ranker busca = new Ranker();
    selAtributo.setEvaluator(avaliador);
    selAtributo.setSearch(busca);
    selAtributo.SelectAttributes(dados);
    int[] indices = selAtributo.selectedAttributes();
    System.out.println("Selected attributes: " + Utils.arrayToString(indices));

    // Usa o algoritimo J48 e mostra a classificao dos dados em forma textual
    String[] opcoes = new String[1];
    opcoes[0] = "-U";
    J48 arvore = new J48();
    arvore.setOptions(opcoes);
    arvore.buildClassifier(dados);
    System.out.println(arvore);

    // Usa o algoritimo J48 e mostra a classificao de dados em forma grafica
    /*
    TreeVisualizer tv = new TreeVisualizer(null, arvore.graph(), new PlaceNode2());
    JFrame frame = new javax.swing.JFrame("?rvore de Conhecimento");
    frame.setSize(800,500);
    frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
    frame.getContentPane().add(tv);
    frame.setVisible(true);
    tv.fitToScreen();
    */

    /*
    * Classificao de novos dados
    */

    System.out.println("\n\nCLASSIFICAO DE NOVOS DADOS");
    // criar atributos
    double[] vals = new double[dados.numAttributes()];
    vals[0] = 1.0; // hair
    vals[1] = 0.0; // feathers
    vals[2] = 0.0; // eggs
    vals[3] = 1.0; // milk
    vals[4] = 1.0; // airborne
    vals[5] = 0.0; // aquatic
    vals[6] = 0.0; // predator
    vals[7] = 1.0; // toothed
    vals[8] = 1.0; // backbone
    vals[9] = 1.0; // breathes
    vals[10] = 0.0; // venomous
    vals[11] = 0.0; // fins
    vals[12] = 4.0; // legs
    vals[13] = 1.0; // tail
    vals[14] = 1.0; // domestic
    vals[15] = 1.0; // catsize

    // Criar uma instncia baseada nestes atributos
    Instance meuUnicornio = new DenseInstance(1.0, vals);

    // Adicionar a instncia nos dados
    meuUnicornio.setDataset(dados);

    // Classificar esta nova instncia
    double label = arvore.classifyInstance(meuUnicornio);

    // Imprimir o resultado da classificao
    System.out.println("Novo Animal: Unicrnio");
    System.out.println("classificacao: " + dados.classAttribute().value((int) label));

    /*
    * Avaliao e predio de erros de mtrica
    */
    System.out.println("\n\nAVALIAO E PREDIO DE ERROS DE MTRICA");
    Classifier cl = new J48();
    Evaluation eval_roc = new Evaluation(dados);
    eval_roc.crossValidateModel(cl, dados, 10, new Random(1), new Object[] {});
    System.out.println(eval_roc.toSummaryString());

    /*
    * Matriz de confuso
    */
    System.out.println("\n\nMATRIZ DE CONFUSO");
    double[][] confusionMatrix = eval_roc.confusionMatrix();
    System.out.println(eval_roc.toMatrixString());

}

From source file:ia03classificador.jFrClassificador.java

public void doClassificate() throws Exception {

    // Quando clicado, a variavel recebe 1, quando no clicado recebe 0
    v00 = ((btn00.isSelected()) ? ((double) 1) : ((double) 0));
    v01 = ((btn01.isSelected()) ? ((double) 1) : ((double) 0));
    v02 = ((btn02.isSelected()) ? ((double) 1) : ((double) 0));
    v03 = ((btn03.isSelected()) ? ((double) 1) : ((double) 0));
    v04 = ((btn04.isSelected()) ? ((double) 1) : ((double) 0));
    v05 = ((btn05.isSelected()) ? ((double) 1) : ((double) 0));
    v06 = ((btn06.isSelected()) ? ((double) 1) : ((double) 0));
    v07 = ((btn07.isSelected()) ? ((double) 1) : ((double) 0));
    v08 = ((btn08.isSelected()) ? ((double) 1) : ((double) 0));
    v09 = ((btn09.isSelected()) ? ((double) 1) : ((double) 0));
    v10 = ((btn10.isSelected()) ? ((double) 1) : ((double) 0));
    v11 = ((btn11.isSelected()) ? ((double) 1) : ((double) 0));
    v13 = ((btn13.isSelected()) ? ((double) 1) : ((double) 0));
    v14 = ((btn14.isSelected()) ? ((double) 1) : ((double) 0));
    v15 = ((btn15.isSelected()) ? ((double) 1) : ((double) 0));
    legs = txtLegs.getText();/*from  w ww. java 2 s .  c o  m*/
    legs = ((legs == null || legs.trim().isEmpty() ? "2" : legs));
    name = txtName.getText();

    // abre o banco de dados arff e guarda os registros no objeto dados
    ConverterUtils.DataSource arquivo = new ConverterUtils.DataSource("data/zoo.arff");
    Instances dados = arquivo.getDataSet();

    // FILTER: remove o atributo nome do animal da classificao
    String[] parametros = new String[] { "-R", "1" };
    Remove filtro = new Remove();
    filtro.setOptions(parametros);
    filtro.setInputFormat(dados);
    dados = Filter.useFilter(dados, filtro);

    AttributeSelection selAtributo = new AttributeSelection();
    InfoGainAttributeEval avaliador = new InfoGainAttributeEval();
    Ranker busca = new Ranker();
    selAtributo.setEvaluator(avaliador);
    selAtributo.setSearch(busca);
    selAtributo.SelectAttributes(dados);
    int[] indices = selAtributo.selectedAttributes();
    //System.out.println("Selected attributes: " + Utils.arrayToString(indices));

    // Usa o algoritimo J48 para montar a arvore de dados
    String[] opcoes = new String[1];
    opcoes[0] = "-U";
    J48 arvore = new J48();
    arvore.setOptions(opcoes);
    arvore.buildClassifier(dados);

    // cria o novo elemento para comparao
    double[] vals = new double[dados.numAttributes()];
    vals[0] = v00; // hair
    vals[1] = v01; // feathers
    vals[2] = v02; // eggs
    vals[3] = v03; // milk
    vals[4] = v04; // airborne
    vals[5] = v05; // aquatic
    vals[6] = v06; // predator
    vals[7] = v07; // toothed
    vals[8] = v08; // backbone
    vals[9] = v09; // breathes
    vals[10] = v10; // venomous
    vals[11] = v11; // fins
    vals[12] = Double.parseDouble(legs); // legs
    vals[13] = v13; // tail
    vals[14] = v14; // domestic
    vals[15] = v15; // catsize

    // Criar uma instncia baseada nestes atributos
    Instance newAnimal = new DenseInstance(1.0, vals);

    // Adicionar a instncia nos dados
    newAnimal.setDataset(dados);

    // Classificar esta nova instncia
    double label = arvore.classifyInstance(newAnimal);

    // Imprimir o resultado da classificao
    lblClassification.setText(dados.classAttribute().value((int) label));

}

From source file:intensityclustering.IntensityClustering.java

/**
 * Draws the 2D Histogram Plot in the IntensityClustering. X-Axsis is
 * intensity value of chanel 2 image (where the stained nuclei are). Y-axis
 * are relative frequencies of present nuclei.
 *
 * @param tss The TMAspots whose nuclei are considered (both gold-standard
 * and estimated nuclei)./*from  w w w .  j a  va 2s . c  om*/
 * @param doAlsoClustering If true, the TMApoints are also clustered
 * according to the histogram.
 */
void drawNucleiIntensities2D(List<TMAspot> tss, boolean doAlsoClustering) {
    // draw the plot
    Plot2DPanel plot;
    if (((java.awt.BorderLayout) (jPanel9.getLayout()))
            .getLayoutComponent(java.awt.BorderLayout.CENTER) != null) {
        plot = (Plot2DPanel) ((java.awt.BorderLayout) (jPanel9.getLayout()))
                .getLayoutComponent(java.awt.BorderLayout.CENTER);
        plot.removeAllPlots();
        plot.removeAllPlotables();
    } else {
        plot = new Plot2DPanel(PlotPanel.SOUTH);
        plot.setAxisLabels("Intensity", "Frequency");
        plot.plotCanvas.setBackground(jPanel9.getBackground());
        plot.plotLegend.setBackground(jPanel9.getBackground());
        plot.plotToolBar.setBackground(plot.plotCanvas.getBackground());
    }
    if (((java.awt.BorderLayout) (jPanel9.getLayout()))
            .getLayoutComponent(java.awt.BorderLayout.CENTER) == null) {
        jPanel9.add(plot, java.awt.BorderLayout.CENTER);
        jPanel15.setBackground(plot.plotCanvas.getBackground());
        jPanel15.setVisible(true);
        validate();
        pack();
    }

    if (tss.size() > 0) {
        try {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

            List<Integer> intensities = new ArrayList<>();
            int intensity;
            int min = Integer.parseInt(jTextField1.getText());
            int max = Integer.parseInt(jTextField16.getText());
            for (TMAspot ts : tss) {
                //TODO: GET THE CHANNEL 2 Image
                //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false);
                BufferedImage img = ts.getBufferedImage(false);
                // img can be null if color deconvolution has not been performed, yet.
                if (img != null) {
                    List<TMApoint> tps = ts.getPoints();
                    for (TMALabel tp : tps) {
                        intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false)
                                .getRed();
                        if (intensity >= min && intensity <= max) {
                            intensities.add(intensity);
                        }
                    }
                }
            }

            double[] intensities_array = new double[intensities.size()];

            for (int i = 0; i < intensities.size(); i++) {
                intensities_array[i] = intensities.get(i);
            }

            int nbins = jSlider7.getValue();
            if (intensities_array.length > 0) {
                plot.addHistogramPlot("TMA points", intensities_array, 0, 256, nbins);
            } //else {
              //  JOptionPane.showMessageDialog(this, "No TMA points have been found.", "No TMA points found.", JOptionPane.WARNING_MESSAGE);
              //}

            //// Cluster Points according to histograms
            if (doAlsoClustering) {
                // Find Clusters
                int n = getParam_nClusters();

                // Create ARFF Data
                FastVector atts;
                Instances data;
                int i;

                // 1. create arff data format
                atts = new FastVector(1);
                for (i = 0; i < 1; i++) {
                    atts.addElement(new Attribute(Integer.toString(i)));
                }

                // 2. create Instances object
                data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss));

                // 3. fill with data
                for (i = 0; i < intensities_array.length; i++) {
                    // add the instance
                    Instance inst = new Instance(1.0, new double[] { intensities_array[i] });
                    inst.setDataset(data);
                    data.add(inst);
                }

                // 4. set data class index (last attribute is the class)
                //data.setClassIndex(data.numAttributes() - 1); // not for weka 3.5.X
                if (tmarker.DEBUG > 4) {
                    java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                            data.toString());
                }

                Clusterer clusterer = getClusterer();
                String[] options = getClustererOptions();

                if (tmarker.DEBUG > 3) {
                    if (options.length > 0) {
                        String info = "Clusterer should have options:\n";
                        for (String o : options) {
                            info += o + " ";
                        }
                        info += "\n";
                        java.util.logging.Logger.getLogger(getClass().getName())
                                .log(java.util.logging.Level.INFO, info);
                    }
                }

                clusterer.setOptions(options); // set the clusterer options
                clusterer.buildClusterer(data); // build the clusterer

                // order the clusters according to the brightness
                // The most bright cluster should be 0, then 1, then 2,...
                ArrayList<ArrayList<Double>> values = new ArrayList<>();
                for (i = 0; i < n; i++) {
                    values.add(new ArrayList<Double>());
                }
                int z;
                double value;
                for (i = 0; i < data.numInstances(); i++) {
                    z = clusterer.clusterInstance(data.instance(i));
                    value = data.instance(i).value(0);
                    values.get(z).add(value);
                }
                double[] means = new double[n];
                double[] stds = new double[n];
                for (i = 0; i < n; i++) {
                    means[i] = Misc.mean(values.get(i).toArray(new Double[values.get(i).size()]));
                    stds[i] = Misc.std(values.get(i).toArray(new Double[values.get(i).size()]));
                }
                int[] ordering = Misc.orderArray(means, true);

                for (i = 0; i < n; i++) {
                    int ind = Misc.IndexOf(ordering, i);
                    plot.addPlotable(new Line(getParam_ColorOfClassK(i),
                            new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] },
                            new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] }, 2 * stds[ind]));
                    plot.addPlot(Plot2DPanel.LINE, "Staining " + i, getParam_ColorOfClassK(i),
                            new double[][] { new double[] { means[ind], plot.plotCanvas.base.roundXmin[1] },
                                    new double[] { means[ind], plot.plotCanvas.base.roundXmax[1] } });
                }

                String clusterInfo = "";
                for (String o : clusterer.getOptions()) {
                    clusterInfo += o + " ";
                }
                clusterInfo += "\n\n";
                clusterInfo += clusterer.toString().trim();
                if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                    try {
                        clusterInfo += ((HierarchicalClusterer) clusterer).graph();
                        HierarchyVisualizer a = new HierarchyVisualizer(
                                ((HierarchicalClusterer) clusterer).graph());
                        a.setSize(800, 600);
                        if (clusterVisualizer == null) {
                            clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram");
                            clusterVisualizer.setIconImage(getIconImage());
                            clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
                            clusterVisualizer.setSize(800, 600);
                        }
                        Container contentPane = clusterVisualizer.getContentPane();
                        contentPane.removeAll();
                        contentPane.add(a);
                    } catch (Exception e) {
                        clusterVisualizer = null;
                    }
                }
                jTextArea1.setText(clusterInfo);

                if (tmarker.DEBUG > 3) {
                    String info = "Clusterer has options\n";
                    for (String o : clusterer.getOptions()) {
                        info += o + " ";
                    }
                    info += "\n";
                    info += clusterer.toString() + "\n";
                    // info += (clusterer).globalInfo() + "\n";
                    info += "\n";
                    info += clusterInfo + "\n";
                    java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                            info);
                }

                // cluster all TMAspots and assign the corresponding class to them
                // Cluster the points
                List<List<Integer>> clustered_points = new ArrayList<>();
                for (i = 0; i < n; i++) {
                    clustered_points.add(new ArrayList<Integer>());
                }

                int k;
                for (TMAspot ts : tss) {
                    //TODO: GET THE CHANNEL 2 IMAGE
                    //BufferedImage img = ts.getBufferedImage(TMAspot.SHOW_CHANNEL2_IMAGE, false);
                    BufferedImage img = ts.getBufferedImage(false);
                    List<TMApoint> tps = ts.getPoints();
                    for (TMApoint tp : tps) {
                        intensity = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false)
                                .getRed();

                        // add the instance
                        Instance inst = new Instance(1.0, new double[] { intensity });
                        inst.setDataset(data);
                        k = ordering[clusterer.clusterInstance(inst)];

                        // store the color for later visualization
                        clustered_points.get(k).add(intensity);

                        // set the staining of the TMApoint
                        switch (k) {
                        case 0:
                            tp.setStaining(TMALabel.STAINING_0);
                            break;
                        case 1:
                            tp.setStaining(TMALabel.STAINING_1);
                            break;
                        case 2:
                            tp.setStaining(TMALabel.STAINING_2);
                            break;
                        default:
                            tp.setStaining(TMALabel.STAINING_3);
                            break;
                        }
                    }
                    ts.dispStainingInfo();
                    if (manager.getVisibleTMAspot() == ts) {
                        manager.repaintVisibleTMAspot();
                    }
                }

                // Write the description
                String description = "Nuclei clustered with " + getParam_AutomaticClustererString();
                if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                    description += " (" + getParam_HierarchicalClusteringMethod() + ")";
                }
                description += ", n=" + getParam_nClusters() + ", channel 2 intensity.";
                jLabel42.setText(description);
                jLabel41.setText(" ");

            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));
        }
    }
}

From source file:intensityclustering.IntensityClustering.java

/**
 * Clusters the TMApoints on given TMAspots according to their staining
 * intensity (color). All parameters (e.g. clusterer and parameters) are
 * selected by the user. Features are simple color features.
 *
 * @param tss The TMAspots of which all nuclei (gold-standard and estimated)
 * are clustered according to color./* w w  w  .  j ava 2  s.c  om*/
 */
private void clusterPointsAutomaticallyColorSpace(List<TMAspot> tss) {
    if (tss.size() > 0) {
        try {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

            int n = getParam_nClusters();

            // Create ARFF Data
            FastVector atts;
            Instances data;
            int i;

            // 1. create arff data format
            atts = new FastVector(3);
            for (i = 0; i < 3; i++) {
                atts.addElement(new Attribute(Integer.toString(i)));
            }

            // 2. create Instances object
            data = new Instances("TMA points", atts, tmarker.getNumberNuclei(tss));

            // 3. fill with data
            BufferedImage img;
            Color c;
            float[] features = new float[3];
            String colorSpace = getParam_ColorSpace();
            for (TMAspot ts : tss) {
                img = ts.getBufferedImage();
                List<TMApoint> tps = ts.getPoints();
                for (TMApoint tp : tps) {
                    Color2Feature(TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false),
                            colorSpace, features);

                    // add the instance
                    Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] });
                    inst.setDataset(data);
                    data.add(inst);
                }
            }

            // 4. set data class index (last attribute is the class)
            //data.setClassIndex(data.numAttributes() - 1); // not for weka 3.5.X
            if (tmarker.DEBUG > 4) {
                java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                        data.toString());
            }

            Clusterer clusterer = getClusterer();
            String[] options = getClustererOptions();
            if (false && colorSpace.equalsIgnoreCase("hsb")) {
                String[] newoptions = new String[options.length + 2];
                System.arraycopy(options, 0, newoptions, 0, options.length);
                newoptions[options.length] = "-A";
                newoptions[options.length + 1] = "weka.core.MyHSBDistance";
                options = newoptions;
            }

            if (tmarker.DEBUG > 3) {
                if (options.length > 0) {
                    String info = "Clusterer should have options\n";
                    for (String o : options) {
                        info += o + " ";
                    }
                    info += "\n";
                    java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                            info);
                }
            }

            clusterer.setOptions(options); // set the clusterer options
            clusterer.buildClusterer(data); // build the clusterer

            // order the clusters according to the brightness
            // The most bright cluster should be 0, then 1, then 2,...
            ArrayList<ArrayList<Double>> values = new ArrayList<>();
            for (i = 0; i < clusterer.numberOfClusters(); i++) {
                values.add(new ArrayList<Double>());
            }
            int z;
            double value;
            for (i = 0; i < data.numInstances(); i++) {
                z = clusterer.clusterInstance(data.instance(i));
                value = getParam_ColorSpace().equalsIgnoreCase("hsb") ? data.instance(i).value(2)
                        : Misc.RGBToGray(data.instance(i).value(0), data.instance(i).value(1),
                                data.instance(i).value(2));
                values.get(z).add(value);
            }
            double[] means = new double[clusterer.numberOfClusters()];
            for (i = 0; i < clusterer.numberOfClusters(); i++) {
                means[i] = Misc.mean(values.get(i).toArray(new Double[values.get(i).size()]));
            }
            int[] ordering = Misc.orderArray(means, !getParam_ColorSpace().equalsIgnoreCase("rtp"));

            String clusterInfo = "";
            for (String o : clusterer.getOptions()) {
                clusterInfo += o + " ";
            }
            clusterInfo += "\n\n";
            clusterInfo += clusterer.toString().trim();
            if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                try {
                    clusterInfo += ((HierarchicalClusterer) clusterer).graph();
                    HierarchyVisualizer a = new HierarchyVisualizer(
                            ((HierarchicalClusterer) clusterer).graph());
                    a.setSize(800, 600);
                    if (clusterVisualizer == null) {
                        clusterVisualizer = new JFrame("Hierarchical Clusterer Dendrogram");
                        clusterVisualizer.setIconImage(getIconImage());
                        clusterVisualizer.setDefaultCloseOperation(JFrame.DISPOSE_ON_CLOSE);
                        clusterVisualizer.setSize(800, 600);
                    }
                    Container contentPane = clusterVisualizer.getContentPane();
                    contentPane.removeAll();
                    contentPane.add(a);
                } catch (Exception e) {
                    clusterVisualizer = null;
                }
            }
            jTextArea1.setText(clusterInfo);

            if (tmarker.DEBUG > 3) {
                String info = "Clusterer has options\n";
                for (String o : clusterer.getOptions()) {
                    info += o + " ";
                }
                info += "\n";
                info += clusterer.toString() + "\n";
                // info += (clusterer).globalInfo() + "\n";
                info += "\n";
                info += clusterInfo + "\n";
                java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.INFO,
                        info);
            }

            // cluster all TMAspots and assign the corresponding class to them
            // Cluster the points
            List<List<Color>> clustered_points = new ArrayList<>();
            for (i = 0; i < clusterer.numberOfClusters(); i++) {
                clustered_points.add(new ArrayList<Color>());
            }

            int k;
            for (TMAspot ts : tss) {
                img = ts.getBufferedImage();
                List<TMApoint> tps = ts.getPoints();
                for (TMApoint tp : tps) {
                    c = TMAspot.getAverageColorAtPoint(img, tp.x, tp.y, ts.getParam_r(), false);
                    Color2Feature(c, colorSpace, features);

                    // add the instance
                    Instance inst = new Instance(1.0, new double[] { features[0], features[1], features[2] });
                    inst.setDataset(data);
                    k = ordering[clusterer.clusterInstance(inst)];

                    // store the color for later visualization
                    clustered_points.get(k).add(c);

                    // set the staining of the TMApoint
                    switch (k) {
                    case 0:
                        tp.setStaining(TMALabel.STAINING_0);
                        break;
                    case 1:
                        tp.setStaining(TMALabel.STAINING_1);
                        break;
                    case 2:
                        tp.setStaining(TMALabel.STAINING_2);
                        break;
                    default:
                        tp.setStaining(TMALabel.STAINING_3);
                        break;
                    }
                }
                ts.dispStainingInfo();
                if (manager.getVisibleTMAspot() == ts) {
                    manager.repaintVisibleTMAspot();
                }
            }

            // draw the points
            Plot3DPanel plot;
            if (((java.awt.BorderLayout) (jPanel2.getLayout()))
                    .getLayoutComponent(java.awt.BorderLayout.CENTER) != null) {
                plot = (Plot3DPanel) ((java.awt.BorderLayout) (jPanel2.getLayout()))
                        .getLayoutComponent(java.awt.BorderLayout.CENTER);
                plot.removeAllPlots();
            } else {
                plot = new Plot3DPanel();
                plot.plotCanvas.setBackground(jPanel2.getBackground());
                plot.addLegend(PlotPanel.SOUTH);
                plot.plotLegend.setBackground(jPanel2.getBackground());
            }
            if (colorSpace.equalsIgnoreCase("hsb")) {
                plot.setAxisLabels("Hue", "Saturation", "Brightness");
            } else if (colorSpace.equalsIgnoreCase("rtp")) {
                plot.setAxisLabels("R", "Theta", "Phi");
            } else {
                plot.setAxisLabels("Red", "Green", "Blue");
            }

            for (i = 0; i < clusterer.numberOfClusters(); i++) {
                double[] xs = new double[clustered_points.get(i).size()];
                double[] ys = new double[clustered_points.get(i).size()];
                double[] zs = new double[clustered_points.get(i).size()];
                for (int j = 0; j < clustered_points.get(i).size(); j++) {
                    Color2Feature(clustered_points.get(i).get(j), colorSpace, features);
                    xs[j] = features[0];
                    ys[j] = features[1];
                    zs[j] = features[2];
                }
                if (xs.length > 0) {
                    c = getParam_ColorOfClassK(i);
                    plot.addScatterPlot("Staining " + i, c, xs, ys, zs);
                }
            }

            // Write the description
            String description = "Nuclei clustered with " + getParam_AutomaticClustererString();
            if (getParam_AutomaticClustererString().equalsIgnoreCase("Hierarchical")) {
                description += " (" + getParam_HierarchicalClusteringMethod() + ")";
            }
            description += ", n=" + getParam_nClusters() + ", color space " + getParam_ColorSpace() + ".";
            jLabel41.setText(description);
            jLabel42.setText(" ");

            if (((java.awt.BorderLayout) (jPanel2.getLayout()))
                    .getLayoutComponent(java.awt.BorderLayout.CENTER) == null) {
                jPanel2.add(plot, java.awt.BorderLayout.CENTER);
                validate();
                pack();
            }
        } catch (Exception | OutOfMemoryError e) {
            java.util.logging.Logger.getLogger(getClass().getName()).log(java.util.logging.Level.SEVERE, null,
                    e);
            JOptionPane.showMessageDialog(this,
                    "The clustering could not be performed.\n\n" + "A possible reasons is:\n"
                            + "- Not enough memory (too many points), \n\n"
                            + "You might want to try a different clustering method or less TMAspots.\n\n"
                            + "The error message is: \n" + e.getMessage(),
                    "Error at Nucleus clustering", JOptionPane.WARNING_MESSAGE);
        } finally {
            this.setCursor(Cursor.getPredefinedCursor(Cursor.DEFAULT_CURSOR));
        }
    }
}

From source file:jwebminer2.FeatureValueFileSaver.java

/**
 * Save the given text to the given location in the given format or
 * save the stored feature values, depending on the chosen_file_extension.
 * A progress bar is displayed (although not incremented).
 *
 * @param chosen_file_extension The file extension (corresponding to one
 *                              of the extensions published by the
 *                              getFileFormatExtension method) to use when
 *                              saving data_to_save, and the corresponding
 *                              file format.
 * @param data_to_save          The HTML code displayed on-screen. May be
 *                              null for non-HTML saving.
 * @param save_location         The file to save data_to_save to.
 * @throws Exception            Throws an Exception if the file cannot be
 *                              saved./*from  ww  w.ja va2 s  .  c o m*/
 */
public void saveContents(String chosen_file_extension, String data_to_save, File save_location)
        throws Exception {
    // Prepare the progress bar
    SimpleProgressBarDialog progress_bar = new SimpleProgressBarDialog(1, results_panel);

    // Write the whole contents of data_to_save verbatim as an HTML file
    // if an HTML file is to be saved
    if (chosen_file_extension.equals("HTML")) {
        DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                .getDataOutputStream(save_location);
        writer.writeBytes(data_to_save);
        writer.close();
    }

    // Only save the table of final feature values itself if a non-HTML
    // file format is to be saved
    else {
        // Access information to store
        double[][] feature_table = results_panel.feature_values;

        String[] column_labels = results_panel.column_labels;
        String[] row_labels = results_panel.row_labels;
        String[] orig_column_labels = column_labels;

        if (AnalysisProcessor.lastfm_enabled && AnalysisProcessor.is_cross_tabulation
                && (AnalysisProcessor.yahoo_application_id != null
                        || AnalysisProcessor.google_license_key != null)) {
            String[] column_labels_lastfm_websearch = new String[2 * column_labels.length];
            for (int i = 0; i < column_labels.length; i++) {
                column_labels_lastfm_websearch[i] = column_labels[i] + "_WS";
                column_labels_lastfm_websearch[i + column_labels.length] = column_labels[i] + "_LastFM";
            }
            column_labels = column_labels_lastfm_websearch;
        } else {
            column_labels = orig_column_labels;
        }

        // Save as tab delimited text file
        if (chosen_file_extension.equals("TXT")) {
            // Calculate the table to save
            String[][] results_table = new String[row_labels.length + 1][column_labels.length + 1];
            results_table[0][0] = "";
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    if (i == 0) {
                        if (j != 0)
                            results_table[i][j] = column_labels[j - 1];
                    } else {
                        if (j == 0)
                            results_table[i][j] = row_labels[i - 1];
                        else
                            results_table[i][j] = String.valueOf(feature_table[i - 1][j - 1]);
                    }
                }
            }

            // Save the table
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            for (int i = 0; i < results_table.length; i++) {
                for (int j = 0; j < results_table[i].length; j++) {
                    // Write the table entry
                    writer.writeBytes(results_table[i][j]);

                    // Add a tab or a line break
                    if (j == results_table[i].length - 1)
                        writer.writeBytes("\n");
                    else
                        writer.writeBytes("\t");
                }
            }

            // Close the writing stream
            writer.close();
        }

        // Save as ACE XML file
        else if (chosen_file_extension.equals("ACE XML")) {
            // Set the name of the dataset to the name of the file
            // that is tob be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Prepare feature definitions and store feature names to
            // put in DataSets
            FeatureDefinition[] feature_definitions = new FeatureDefinition[column_labels.length];
            String[] feature_names = new String[column_labels.length];
            for (int feat = 0; feat < feature_definitions.length; feat++) {
                feature_definitions[feat] = new FeatureDefinition(column_labels[feat], "", false, 1);
                feature_names[feat] = column_labels[feat];
            }

            // Prepare the the DataSets to write
            DataSet[] data_sets = new DataSet[row_labels.length];
            for (int instance = 0; instance < data_sets.length; instance++) {
                // Instantiate the DataSet
                data_sets[instance] = new DataSet();

                // Store the instance names
                data_sets[instance].identifier = row_labels[instance];

                // Store the names of the features
                data_sets[instance].feature_names = feature_names;

                // Store the features for this DataSet as well as the
                // feature names
                double[][] these_feature_values = new double[feature_table[instance].length][1];
                for (int feat = 0; feat < these_feature_values.length; feat++)
                    these_feature_values[feat][0] = feature_table[instance][feat];
                data_sets[instance].feature_values = these_feature_values;

                // Validate, order and compact the DataSet
                data_sets[instance].orderAndCompactFeatures(feature_definitions, true);
            }

            // Save the feature values
            DataSet.saveDataSets(data_sets, feature_definitions, save_location,
                    "Features extracted with jWebMiner 2.0");
        }

        // Save as Weka ARFF file
        else if (chosen_file_extension.equals("Weka ARFF")) {
            // Set the name of the dataset to the name of the file
            // that is to be saved
            String data_set_name = mckay.utilities.staticlibraries.StringMethods
                    .removeExtension(save_location.getName());

            // Set the Attributes (feature names and class names)
            FastVector attributes_vector = new FastVector(column_labels.length + 1); // extra 1 is for class name
            for (int feat = 0; feat < column_labels.length; feat++)
                attributes_vector.addElement(new Attribute(column_labels[feat]));
            FastVector class_names_vector = new FastVector(column_labels.length);
            for (int cat = 0; cat < orig_column_labels.length; cat++)
                class_names_vector.addElement(orig_column_labels[cat]);
            attributes_vector.addElement(new Attribute("Class", class_names_vector));

            // Store attributes in an Instances object
            Instances instances = new Instances(data_set_name, attributes_vector, row_labels.length);
            instances.setClassIndex(instances.numAttributes() - 1);

            // Store the feature values and model classifications
            for (int inst = 0; inst < row_labels.length; inst++) {
                // Initialize an instance
                Instance this_instance = new Instance(instances.numAttributes());
                this_instance.setDataset(instances);
                int current_attribute = 0;

                // Set feature values for the instance
                for (int feat = 0; feat < column_labels.length; feat++)
                    this_instance.setValue(feat, feature_table[inst][feat]);

                // Set the class value for the instance
                // this_instance.setClassValue("a");
                instances.setRelationName("jWebMiner2");

                // Add this instance to instances
                instances.add(this_instance);
            }

            // Prepare the buffer to save to and add comments indicating
            // the names of the rows
            DataOutputStream writer = mckay.utilities.staticlibraries.FileMethods
                    .getDataOutputStream(save_location);
            writer.writeBytes("% INSTANCES (DATA ROWS) BELOW CORRESPOND TO:\n%\n");
            for (int inst = 0; inst < row_labels.length; inst++)
                writer.writeBytes("%    " + (inst + 1) + ") " + row_labels[inst] + "\n");
            writer.writeBytes("%\n");

            // Save the ARFF file
            ArffSaver arff_saver = new ArffSaver();
            arff_saver.setInstances(instances);
            arff_saver.setFile(save_location);
            arff_saver.setDestination(writer);
            try {
                arff_saver.writeBatch();
            } catch (Exception e) {
                throw new Exception(
                        "File only partially saved.\n\nTry resaving the file with a .arff extension.");
            }

            // Close the writer
            writer.close();
        }
    }

    // Terminate the progress bar
    progress_bar.done();
}

From source file:kea.KEAFilter.java

License:Open Source License

/**
 * Converts an instance./*w  w w . j a  v a 2s. c o  m*/
 */
private FastVector convertInstance(Instance instance, boolean training) throws Exception {

    FastVector vector = new FastVector();

    if (m_Debug) {
        System.err.println("-- Converting instance");
    }

    // Get the key phrases for the document
    HashMap hashKeyphrases = null;
    HashMap hashKeysEval = null;
    if (!instance.isMissing(m_KeyphrasesAtt)) {
        String keyphrases = instance.stringValue(m_KeyphrasesAtt);
        hashKeyphrases = getGivenKeyphrases(keyphrases, false);
        hashKeysEval = getGivenKeyphrases(keyphrases, true);
    }

    // Get the phrases for the document
    HashMap hash = new HashMap();
    int length = getPhrases(hash, instance.stringValue(m_DocumentAtt));

    // Compute number of extra attributes
    int numFeatures = 5;
    if (m_Debug) {
        if (m_KFused) {
            numFeatures = numFeatures + 1;
        }
    }

    // Set indices of key attributes
    int phraseAttIndex = m_DocumentAtt;
    int tfidfAttIndex = m_DocumentAtt + 2;
    int distAttIndex = m_DocumentAtt + 3;
    int probsAttIndex = m_DocumentAtt + numFeatures - 1;

    // Go through the phrases and convert them into instances
    Iterator it = hash.keySet().iterator();
    while (it.hasNext()) {
        String phrase = (String) it.next();
        FastVector phraseInfo = (FastVector) hash.get(phrase);
        double[] vals = featVals(phrase, phraseInfo, training, hashKeysEval, hashKeyphrases, length);
        Instance inst = new Instance(instance.weight(), vals);
        inst.setDataset(m_ClassifierData);

        // Get probability of phrase being key phrase
        double[] probs = m_Classifier.distributionForInstance(inst);
        double prob = probs[1];

        // Compute attribute values for final instance
        double[] newInst = new double[instance.numAttributes() + numFeatures];
        int pos = 0;
        for (int i = 0; i < instance.numAttributes(); i++) {
            if (i == m_DocumentAtt) {

                // Add phrase
                int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                newInst[pos++] = index;

                // Add original version
                index = outputFormatPeek().attribute(pos).addStringValue((String) phraseInfo.elementAt(2));
                newInst[pos++] = index;

                // Add TFxIDF
                newInst[pos++] = inst.value(m_TfidfIndex);

                // Add distance
                newInst[pos++] = inst.value(m_FirstOccurIndex);

                // Add other features
                if (m_Debug) {
                    if (m_KFused) {
                        newInst[pos++] = inst.value(m_KeyFreqIndex);
                    }
                }

                // Add probability 
                probsAttIndex = pos;
                newInst[pos++] = prob;

                // Set rank to missing (computed below)
                newInst[pos++] = Instance.missingValue();
            } else if (i == m_KeyphrasesAtt) {
                newInst[pos++] = inst.classValue();
            } else {
                newInst[pos++] = instance.value(i);
            }
        }
        Instance ins = new Instance(instance.weight(), newInst);
        ins.setDataset(outputFormatPeek());
        vector.addElement(ins);
    }

    // Add dummy instances for keyphrases that don't occur
    // in the document
    if (hashKeysEval != null) {
        Iterator phrases = hashKeysEval.keySet().iterator();
        while (phrases.hasNext()) {
            String phrase = (String) phrases.next();
            double[] newInst = new double[instance.numAttributes() + numFeatures];
            int pos = 0;
            for (int i = 0; i < instance.numAttributes(); i++) {
                if (i == m_DocumentAtt) {

                    // Add phrase
                    int index = outputFormatPeek().attribute(pos).addStringValue(phrase);
                    newInst[pos++] = (double) index;

                    // Add original version
                    index = outputFormatPeek().attribute(pos).addStringValue((String) hashKeysEval.get(phrase));
                    newInst[pos++] = (double) index;

                    // Add TFxIDF
                    newInst[pos++] = Instance.missingValue();

                    // Add distance
                    newInst[pos++] = Instance.missingValue();

                    // Add other features
                    if (m_Debug) {
                        if (m_KFused) {
                            newInst[pos++] = Instance.missingValue();
                        }
                    }

                    // Add probability and rank
                    newInst[pos++] = -Double.MAX_VALUE;
                    newInst[pos++] = Instance.missingValue();
                } else if (i == m_KeyphrasesAtt) {
                    newInst[pos++] = 1; // Keyphrase
                } else {
                    newInst[pos++] = instance.value(i);
                }
            }
            Instance inst = new Instance(instance.weight(), newInst);
            inst.setDataset(outputFormatPeek());
            vector.addElement(inst);
        }
    }

    // Sort phrases according to their distance (stable sort)
    double[] vals = new double[vector.size()];
    for (int i = 0; i < vals.length; i++) {
        vals[i] = ((Instance) vector.elementAt(i)).value(distAttIndex);
    }
    FastVector newVector = new FastVector(vector.size());
    int[] sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their tfxidf value (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = -((Instance) vector.elementAt(i)).value(tfidfAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Sort phrases according to their probability (stable sort)
    for (int i = 0; i < vals.length; i++) {
        vals[i] = 1 - ((Instance) vector.elementAt(i)).value(probsAttIndex);
    }
    newVector = new FastVector(vector.size());
    sortedIndices = Utils.stableSort(vals);
    for (int i = 0; i < vals.length; i++) {
        newVector.addElement(vector.elementAt(sortedIndices[i]));
    }
    vector = newVector;

    // Compute rank of phrases. Check for subphrases that are ranked
    // lower than superphrases and assign probability -1 and set the
    // rank to Integer.MAX_VALUE
    int rank = 1;
    for (int i = 0; i < vals.length; i++) {
        Instance currentInstance = (Instance) vector.elementAt(i);

        // Short cut: if phrase very unlikely make rank very low and continue
        if (Utils.grOrEq(vals[i], 1.0)) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
            continue;
        }

        // Otherwise look for super phrase starting with first phrase
        // in list that has same probability, TFxIDF value, and distance as
        // current phrase. We do this to catch all superphrases
        // that have same probability, TFxIDF value and distance as current phrase.
        int startInd = i;
        while (startInd < vals.length) {
            Instance inst = (Instance) vector.elementAt(startInd);
            if ((inst.value(tfidfAttIndex) != currentInstance.value(tfidfAttIndex))
                    || (inst.value(probsAttIndex) != currentInstance.value(probsAttIndex))
                    || (inst.value(distAttIndex) != currentInstance.value(distAttIndex))) {
                break;
            }
            startInd++;
        }
        String val = currentInstance.stringValue(phraseAttIndex);
        boolean foundSuperphrase = false;
        for (int j = startInd - 1; j >= 0; j--) {
            if (j != i) {
                Instance candidate = (Instance) vector.elementAt(j);
                String potSuperphrase = candidate.stringValue(phraseAttIndex);
                if (val.length() <= potSuperphrase.length()) {
                    if (KEAFilter.contains(val, potSuperphrase)) {
                        foundSuperphrase = true;
                        break;
                    }
                }
            }
        }
        if (foundSuperphrase) {
            currentInstance.setValue(probsAttIndex + 1, Integer.MAX_VALUE);
        } else {
            currentInstance.setValue(probsAttIndex + 1, rank++);
        }
    }
    return vector;
}

From source file:kea.KEAPhraseFilter.java

License:Open Source License

/** 
 * Converts an instance by removing all non-alphanumeric characters
 * from its string attribute values./* www .  j a  va 2s .  co m*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if (!instance.attribute(i).isString() || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            if (!m_SelectCols.isInRange(i)) {
                int index = getOutputFormat().attribute(i).addStringValue(instance.stringValue(i));
                instVals[i] = (double) index;
                continue;
            }
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            int j = 0;
            boolean phraseStart = true;
            boolean seenNewLine = false;
            boolean haveSeenHyphen = false;
            boolean haveSeenSlash = false;
            while (j < str.length()) {
                boolean isWord = false;
                boolean potNumber = false;
                int startj = j;
                while (j < str.length()) {
                    char ch = str.charAt(j);
                    if (Character.isLetterOrDigit(ch)) {
                        potNumber = true;
                        if (Character.isLetter(ch)) {
                            isWord = true;
                        }
                        j++;
                    } else if ((!m_DisallowInternalPeriods && (ch == '.')) || (ch == '@') || (ch == '_')
                            || (ch == '&') || (ch == '/') || (ch == '-')) {
                        if ((j > 0) && (j + 1 < str.length()) && Character.isLetterOrDigit(str.charAt(j - 1))
                                && Character.isLetterOrDigit(str.charAt(j + 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else if (ch == '\'') {
                        if ((j > 0) && Character.isLetterOrDigit(str.charAt(j - 1))) {
                            j++;
                        } else {
                            break;
                        }
                    } else {
                        break;
                    }
                }
                if (isWord == true) {
                    if (!phraseStart) {
                        if (haveSeenHyphen) {
                            resultStr.append('-');
                        } else if (haveSeenSlash) {
                            resultStr.append('/');
                        } else {
                            resultStr.append(' ');
                        }
                    }
                    resultStr.append(str.substring(startj, j));
                    if (j == str.length()) {
                        break;
                    }
                    phraseStart = false;
                    seenNewLine = false;
                    haveSeenHyphen = false;
                    haveSeenSlash = false;
                    if (Character.isWhitespace(str.charAt(j))) {
                        if (str.charAt(j) == '\n') {
                            seenNewLine = true;
                        }
                    } else if (str.charAt(j) == '-') {
                        haveSeenHyphen = true;
                    } else if (str.charAt(j) == '/') {
                        haveSeenSlash = true;
                    } else {
                        phraseStart = true;
                        resultStr.append('\n');
                    }
                    j++;
                } else if (j == str.length()) {
                    break;
                } else if (str.charAt(j) == '\n') {
                    if (seenNewLine) {
                        if (phraseStart == false) {
                            resultStr.append('\n');
                            phraseStart = true;
                        }
                    } else if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    seenNewLine = true;
                    j++;
                } else if (Character.isWhitespace(str.charAt(j))) {
                    if (potNumber) {
                        if (phraseStart == false) {
                            phraseStart = true;
                            resultStr.append('\n');
                        }
                    }
                    j++;
                } else {
                    if (phraseStart == false) {
                        resultStr.append('\n');
                        phraseStart = true;
                    }
                    j++;
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}

From source file:kea.NumbersFilter.java

License:Open Source License

/** 
 * Converts an instance. A phrase boundary is inserted where
 * a number is found./*from  ww w.  j a va 2  s. c  om*/
 */
private void convertInstance(Instance instance) throws Exception {

    double[] instVals = new double[instance.numAttributes()];

    for (int i = 0; i < instance.numAttributes(); i++) {
        if ((!instance.attribute(i).isString()) || instance.isMissing(i)) {
            instVals[i] = instance.value(i);
        } else {
            String str = instance.stringValue(i);
            StringBuffer resultStr = new StringBuffer();
            StringTokenizer tok = new StringTokenizer(str, " \t\n", true);
            while (tok.hasMoreTokens()) {
                String token = tok.nextToken();

                // Everything that doesn't contain at least
                // one letter is considered to be a number
                boolean isNumber = true;
                for (int j = 0; j < token.length(); j++) {
                    if (Character.isLetter(token.charAt(j))) {
                        isNumber = false;
                        break;
                    }
                }
                if (!isNumber) {
                    resultStr.append(token);
                } else {
                    if (token.equals(" ") || token.equals("\t") || token.equals("\n")) {
                        resultStr.append(token);
                    } else {
                        resultStr.append(" \n ");
                    }
                }
            }
            int index = getOutputFormat().attribute(i).addStringValue(resultStr.toString());
            instVals[i] = (double) index;
        }
    }
    Instance inst = new Instance(instance.weight(), instVals);
    inst.setDataset(getOutputFormat());
    push(inst);
}