List of usage examples for edu.stanford.nlp.objectbank ObjectBank getLineIterator
public static ObjectBank<String> getLineIterator(Collection<?> filesStringsAndReaders, String encoding)
From source file:de.iisys.ocr.pos.CustomNERFeatureFactory.java
License:Open Source License
private void initLexicon(SeqClassifierFlags flags) { if (flags.distSimLexicon == null) { return;/*www . java 2 s. c om*/ } if (lexicon != null) { return; } Timing.startDoing("Loading distsim lexicon from " + flags.distSimLexicon); lexicon = Generics.newHashMap(); boolean terryKoo = "terryKoo".equals(flags.distSimFileFormat); for (String line : ObjectBank.getLineIterator(flags.distSimLexicon, flags.inputEncoding)) { String word; String wordClass; if (terryKoo) { String[] bits = line.split("\\t"); word = bits[1]; wordClass = bits[0]; if (flags.distSimMaxBits > 0 && wordClass.length() > flags.distSimMaxBits) { wordClass = wordClass.substring(0, flags.distSimMaxBits); } } else { // "alexClark" String[] bits = line.split("\\s+"); word = bits[0]; wordClass = bits[1]; } if (!flags.casedDistSim) { word = word.toLowerCase(); } if (flags.numberEquivalenceDistSim) { word = WordShapeClassifier.wordShape(word, WordShapeClassifier.WORDSHAPEDIGITS); } lexicon.put(word, wordClass); } Timing.endDoing(); }
From source file:my.demo.DemoUI.java
License:Open Source License
private void runMyClassifier() { //This funtion is used to identify proteins and genes from a given //random abstract by using the stanfor classifier try {/*from w w w. j a v a2 s .c o m*/ int count = 1; int position = 0; Scanner sc2 = null; try { sc2 = new Scanner(new File(Variables.filePathTxt)); logIt("Reading The abstract..."); // to get start and end indices } catch (FileNotFoundException e1) { // TODO Auto-generated catch block!! JOptionPane.showMessageDialog(null, "ERROR!!!File Not Found!!"); //e1.printStackTrace(); } File file = new File(Variables.bratPath + Variables.fileName + ".test"); logIt("Processing The abstract..."); FileWriter fw = new FileWriter(file.getAbsoluteFile()); BufferedWriter bw = new BufferedWriter(fw); while (sc2.hasNext()) { String s = sc2.next(); bw.write("-" + "\t" + s + "\n"); } bw.close(); file = new File(Variables.bratPath + Variables.fileName + ".ann"); logIt("Running the Classifier..."); fw = new FileWriter(file.getAbsoluteFile()); bw = new BufferedWriter(fw); ColumnDataClassifier cdc = new ColumnDataClassifier(Variables.filePathProp); Classifier<String, String> cl = cdc.makeClassifier(cdc.readTrainingExamples(Variables.filePathTrain)); for (String line : ObjectBank.getLineIterator(Variables.bratPath + Variables.fileName + ".test", "utf-8")) { Datum<String, String> d = cdc.makeDatumFromLine(line); System.out.println(line + " ==> " + cl.classOf(d) + "--"); String myclass = ""; if (!cl.classOf(d).equals("Others")) { switch (cl.classOf(d)) { case "gene": myclass = "Gene-Level"; Variables.geneCount++; break; case "protein": myclass = "Protein-Level"; Variables.proteinCount++; break; } //data for visualisation Variables.geneScores.add(cl.scoresOf(d).getCount("gene")); Variables.proteinScores.add(cl.scoresOf(d).getCount("protein")); Variables.keyWords.add(line.substring(line.indexOf("\t") + 1)); logIt("Creating Annotations for Brat..."); bw.write("T" + (count++) + "\t" + myclass + " " + position + " " + (position + line.substring(line.indexOf("\t") + 1).length()) + "\t" + line.substring(line.indexOf("\t") + 1) + "\n"); } else { Variables.otherCount++; } position += (line.substring(line.indexOf("\t") + 1).length() + 1); } bw.close(); System.out.println(Variables.geneScores); //Copying .txt file to brat folder File source = new File(Variables.filePathTxt); File dest = new File(Variables.bratPath + Variables.fileName + ".txt"); //copy file conventional way using Stream copyFileUsingStream(source, dest); logIt("Opening Browser..."); //Open the annotations in the brat server Desktop d = Desktop.getDesktop(); d.browse(new URI(Variables.bratURL + Variables.fileName)); } catch (HeadlessException | IOException e) { JOptionPane.showMessageDialog(null, "ERROR!!!Some error has occured"); } catch (URISyntaxException ex) { Logger.getLogger(DemoUI.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:my.demo.DemoUI.java
License:Open Source License
private void step2Classify() { //This function finds out the given set of keywords are responsible for which disease Variables.leukemiaCount = 0;/* w ww .j a va 2 s . c o m*/ Variables.gliomaCount = 0; Variables.breastCancerCount = 0; Variables.pancreaticCancerCount = 0; ColumnDataClassifier cdc = new ColumnDataClassifier(Variables.filePathProp2); Classifier<String, String> cl; if (!Variables.multiVariate) { cl = cdc.makeClassifier(cdc.readTrainingExamples(Variables.filePathTrain2)); } else { cl = cdc.makeClassifier(cdc.readTrainingExamples(Variables.filePathTrain2multi)); } double threshold1, threshold2; if (!Variables.multiVariate) { threshold1 = Variables.thresholdIndividual; threshold2 = Variables.thresholdOutlier; } else { threshold1 = Variables.thresholdIndividualmulti; threshold2 = Variables.thresholdOutliermulti; } for (String line : ObjectBank.getLineIterator(Variables.bratPath + Variables.fileNameTest2, "utf-8")) { /* Check every keyword and calculte the score it gives towards every disease. The score is considered to be responsible only if it has a score of more than 2 All such responsible scores are added and at the end the disease with the maximum score value is the predicted disease */ Datum<String, String> d = cdc.makeDatumFromLine(line); System.out.println(line + " ==> " + cl.classOf(d) + "==" + cl.scoresOf(d)); switch (cl.classOf(d)) { case "leukemia": if (cl.scoresOf(d).getCount("leukemia") >= threshold1) { Variables.leukemiaCount += cl.scoresOf(d).getCount("leukemia"); System.out.println("Adding"); } break; case "breast-cancer": if (cl.scoresOf(d).getCount("breast-cancer") >= threshold1) { Variables.breastCancerCount += cl.scoresOf(d).getCount("breast-cancer"); } break; case "glioma": if (cl.scoresOf(d).getCount("glioma") >= threshold1) { Variables.gliomaCount += cl.scoresOf(d).getCount("glioma"); } break; case "pancreatic-cancer": if (cl.scoresOf(d).getCount("pancreatic-cancer") >= threshold1) { Variables.pancreaticCancerCount += cl.scoresOf(d).getCount("pancreatic-cancer"); } break; } } logIt("Calculating Scores for Disease Identification..."); /* The concept of outlier is tested on the basis that it cannot surely predict the disease and hence none of the scores go beyond the total 6 */ double max = Math.max(Math.max(Variables.gliomaCount, Variables.pancreaticCancerCount), Math.max(Variables.leukemiaCount, Variables.breastCancerCount)); if (max >= threshold2) { if (max == Variables.pancreaticCancerCount) { JOptionPane.showMessageDialog(null, "PANCREATIC CANCER!!!"); } if (max == Variables.gliomaCount) { JOptionPane.showMessageDialog(null, "GLIOMA!!!"); } if (max == Variables.leukemiaCount) { JOptionPane.showMessageDialog(null, "LEUKEMIA!!!"); } if (max == Variables.breastCancerCount) { JOptionPane.showMessageDialog(null, "BREAST CANCER!!!"); } } else { JOptionPane.showMessageDialog(null, "Outlier!!"); } JOptionPane.showMessageDialog(null, "Scores!!\nPC:" + Variables.pancreaticCancerCount + "\nBC:" + Variables.breastCancerCount + "\nGlioma:" + Variables.gliomaCount + "\nLeukemia:" + Variables.leukemiaCount); logIt("Finish!!"); }