Example usage for edu.stanford.nlp.objectbank ObjectBank getLineIterator

Introduction

This page shows example usages of the getLineIterator method of edu.stanford.nlp.objectbank.ObjectBank.

Prototype

public static ObjectBank<String> getLineIterator(Collection<?> filesStringsAndReaders, String encoding)

Source Link

Usage

From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java

License:Open Source License

/**
 * Loads the distributional-similarity lexicon named by {@code flags.distSimLexicon}
 * into the {@code lexicon} map (word -> word class).
 *
 * <p>Does nothing when no lexicon file is configured or when the lexicon has already
 * been loaded. Two line formats are supported, selected by
 * {@code flags.distSimFileFormat}:
 * <ul>
 *   <li>{@code "terryKoo"}: tab-separated, class first, then word; the class is
 *       truncated to {@code flags.distSimMaxBits} characters when that limit is positive;</li>
 *   <li>anything else ("alexClark"): whitespace-separated, word first, then class.</li>
 * </ul>
 * Lines that do not contain at least two fields (for example a blank trailing line)
 * are skipped instead of aborting the load with an
 * {@link ArrayIndexOutOfBoundsException}.
 *
 * @param flags classifier flags supplying the lexicon location, encoding, format and
 *              the word-normalisation options
 */
private void initLexicon(SeqClassifierFlags flags) {
    if (flags.distSimLexicon == null) {
        return;
    }
    if (lexicon != null) {
        return;
    }
    Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon);
    lexicon = Generics.newHashMap();
    boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat);
    for (String line : ObjectBank.getLineIterator(flags.distSimLexicon, flags.inputEncoding)) {
        String[] bits = terryKoo ? line.split("\\t") : line.split("\\s+");
        if (bits.length < 2) {
            // Blank or malformed line: there is no (word, class) pair to record.
            continue;
        }
        String word;
        String wordClass;
        if (terryKoo) {
            word = bits[1];
            wordClass = bits[0];
            if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) {
                wordClass = wordClass.substring(0, flags.distSimMaxBits);
            }
        } else {
            // "alexClark"
            word = bits[0];
            wordClass = bits[1];
        }
        if (!flags.casedDistSim) {
            word = word.toLowerCase();
        }
        if (flags.numberEquivalenceDistSim) {
            word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS);
        }
        lexicon.put(word, wordClass);
    }
    Timing.endDoing();
}

From source file:my.demo.DemoUI.java

License:Open Source License

/**
 * Identifies gene and protein mentions in the abstract stored at
 * {@code Variables.filePathTxt} using the Stanford {@code ColumnDataClassifier},
 * writes the result as brat annotations and opens the brat page in the browser.
 *
 * <p>Steps:
 * <ol>
 *   <li>split the abstract into whitespace-separated tokens and write one
 *       {@code "-\ttoken"} line per token to {@code <bratPath><fileName>.test};</li>
 *   <li>classify every line of that file, updating the counters and score lists in
 *       {@code Variables} and writing a {@code T<n>} annotation line to
 *       {@code <bratPath><fileName>.ann} for each token not classified as "Others";</li>
 *   <li>copy the abstract next to the annotations and browse to
 *       {@code Variables.bratURL + Variables.fileName}.</li>
 * </ol>
 *
 * <p>All files are read and written as UTF-8 so that the {@code .test} file is read
 * back with the same encoding it was written in. If the abstract cannot be found an
 * error dialog is shown and the method returns without doing any further work.
 */
private void runMyClassifier() {
    try {
        int count = 1;
        int position = 0;

        Scanner sc2;
        try {
            sc2 = new Scanner(new File(Variables.filePathTxt), "utf-8");
            logIt("Reading The abstract...");
        } catch (FileNotFoundException e1) {
            JOptionPane.showMessageDialog(null, "ERROR!!!File Not Found!!");
            // Nothing to classify; continuing would only fail on the missing scanner.
            return;
        }

        File file = new File(Variables.bratPath + Variables.fileName + ".test");

        logIt("Processing The abstract...");

        // One classifier input line per token; the leading "-" is a placeholder class column.
        try (Scanner tokens = sc2; BufferedWriter bw = newUtf8Writer(file.getAbsoluteFile())) {
            while (tokens.hasNext()) {
                String s = tokens.next();
                bw.write("-" + "\t" + s + "\n");
            }
        }

        file = new File(Variables.bratPath + Variables.fileName + ".ann");
        logIt("Running the Classifier...");

        try (BufferedWriter bw = newUtf8Writer(file.getAbsoluteFile())) {
            ColumnDataClassifier cdc = new ColumnDataClassifier(Variables.filePathProp);
            Classifier<String, String> cl = cdc
                    .makeClassifier(cdc.readTrainingExamples(Variables.filePathTrain));
            for (String line : ObjectBank.getLineIterator(Variables.bratPath + Variables.fileName + ".test",
                    "utf-8")) {

                Datum<String, String> d = cdc.makeDatumFromLine(line);
                String label = cl.classOf(d);
                // Everything after the first tab is the token itself.
                String keyword = line.substring(line.indexOf("\t") + 1);
                System.out.println(line + "  ==>  " + label + "--");
                String myclass = "";
                if (!label.equals("Others")) {
                    switch (label) {
                    case "gene":
                        myclass = "Gene-Level";
                        Variables.geneCount++;
                        break;
                    case "protein":
                        myclass = "Protein-Level";
                        Variables.proteinCount++;
                        break;
                    }
                    // Data for visualisation
                    Variables.geneScores.add(cl.scoresOf(d).getCount("gene"));
                    Variables.proteinScores.add(cl.scoresOf(d).getCount("protein"));
                    Variables.keyWords.add(keyword);

                    logIt("Creating Annotations for Brat...");
                    bw.write("T" + (count++) + "\t" + myclass + " " + position + " "
                            + (position + keyword.length()) + "\t" + keyword + "\n");

                } else {
                    Variables.otherCount++;
                }

                // NOTE(review): offsets assume exactly one separator character between
                // consecutive tokens in the abstract - confirm against the input text.
                position += (keyword.length() + 1);
            }
        }

        System.out.println(Variables.geneScores);
        // Copy the .txt file to the brat folder
        File source = new File(Variables.filePathTxt);
        File dest = new File(Variables.bratPath + Variables.fileName + ".txt");

        copyFileUsingStream(source, dest);

        logIt("Opening Browser...");

        // Open the annotations in the brat server
        Desktop d = Desktop.getDesktop();
        d.browse(new URI(Variables.bratURL + Variables.fileName));

    } catch (HeadlessException | IOException e) {
        // Keep the cause in the log; the dialog alone gives no way to diagnose the failure.
        Logger.getLogger(DemoUI.class.getName()).log(Level.SEVERE, null, e);
        JOptionPane.showMessageDialog(null, "ERROR!!!Some error has occured");

    } catch (URISyntaxException ex) {
        Logger.getLogger(DemoUI.class.getName()).log(Level.SEVERE, null, ex);
    }

}

/** Opens {@code file} for buffered writing as UTF-8 text, replacing any existing content. */
private BufferedWriter newUtf8Writer(File file) throws IOException {
    return new BufferedWriter(
            new java.io.OutputStreamWriter(new java.io.FileOutputStream(file), "utf-8"));
}

From source file:my.demo.DemoUI.java

License:Open Source License

/**
 * Predicts which disease the keywords in {@code Variables.bratPath + Variables.fileNameTest2}
 * point to and reports the outcome in message dialogs.
 *
 * <p>Each line of the file is classified. When the score of the predicted disease
 * reaches the per-keyword threshold, that score is added to the disease's running
 * total in {@code Variables}. After all lines are processed, every disease whose total
 * equals the largest total is announced, provided that largest total reaches the
 * outlier threshold; otherwise "Outlier!!" is shown. A final dialog lists all totals.
 *
 * <p>The training file and both thresholds are taken from the multivariate settings
 * when {@code Variables.multiVariate} is set, and from the individual settings otherwise.
 */
private void step2Classify() {
    // Start from a clean slate so totals from a previous run do not carry over.
    Variables.leukemiaCount = 0;
    Variables.gliomaCount = 0;
    Variables.breastCancerCount = 0;
    Variables.pancreaticCancerCount = 0;

    ColumnDataClassifier columnClassifier = new ColumnDataClassifier(Variables.filePathProp2);
    Classifier<String, String> diseaseModel;
    double keywordThreshold;
    double outlierThreshold;
    if (Variables.multiVariate) {
        diseaseModel = columnClassifier
                .makeClassifier(columnClassifier.readTrainingExamples(Variables.filePathTrain2multi));
        keywordThreshold = Variables.thresholdIndividualmulti;
        outlierThreshold = Variables.thresholdOutliermulti;
    } else {
        diseaseModel = columnClassifier
                .makeClassifier(columnClassifier.readTrainingExamples(Variables.filePathTrain2));
        keywordThreshold = Variables.thresholdIndividual;
        outlierThreshold = Variables.thresholdOutlier;
    }

    for (String row : ObjectBank.getLineIterator(Variables.bratPath + Variables.fileNameTest2, "utf-8")) {
        // A keyword contributes to a disease only when it is classified as that disease
        // and its score for it reaches the per-keyword threshold.
        Datum<String, String> datum = columnClassifier.makeDatumFromLine(row);
        String label = diseaseModel.classOf(datum);
        System.out.println(row + "  ==>  " + label + "==" + diseaseModel.scoresOf(datum));

        switch (label) {
        case "leukemia": {
            double score = diseaseModel.scoresOf(datum).getCount("leukemia");
            if (score >= keywordThreshold) {
                Variables.leukemiaCount += score;
                System.out.println("Adding");
            }
            break;
        }
        case "breast-cancer": {
            double score = diseaseModel.scoresOf(datum).getCount("breast-cancer");
            if (score >= keywordThreshold) {
                Variables.breastCancerCount += score;
            }
            break;
        }
        case "glioma": {
            double score = diseaseModel.scoresOf(datum).getCount("glioma");
            if (score >= keywordThreshold) {
                Variables.gliomaCount += score;
            }
            break;
        }
        case "pancreatic-cancer": {
            double score = diseaseModel.scoresOf(datum).getCount("pancreatic-cancer");
            if (score >= keywordThreshold) {
                Variables.pancreaticCancerCount += score;
            }
            break;
        }
        }
    }

    logIt("Calculating Scores for Disease Identification...");

    // If even the best total stays below the outlier threshold, no disease is reported.
    double best = Math.max(
            Math.max(Variables.gliomaCount, Variables.pancreaticCancerCount),
            Math.max(Variables.leukemiaCount, Variables.breastCancerCount));
    if (best >= outlierThreshold) {
        // Independent checks on purpose: tied totals each get their own dialog.
        if (best == Variables.pancreaticCancerCount) {
            JOptionPane.showMessageDialog(null, "PANCREATIC CANCER!!!");
        }
        if (best == Variables.gliomaCount) {
            JOptionPane.showMessageDialog(null, "GLIOMA!!!");
        }
        if (best == Variables.leukemiaCount) {
            JOptionPane.showMessageDialog(null, "LEUKEMIA!!!");
        }
        if (best == Variables.breastCancerCount) {
            JOptionPane.showMessageDialog(null, "BREAST CANCER!!!");
        }
    } else {
        JOptionPane.showMessageDialog(null, "Outlier!!");
    }

    String summary = "Scores!!\nPC:" + Variables.pancreaticCancerCount + "\nBC:" + Variables.breastCancerCount
            + "\nGlioma:" + Variables.gliomaCount + "\nLeukemia:" + Variables.leukemiaCount;
    JOptionPane.showMessageDialog(null, summary);
    logIt("Finish!!");
}