List of usage examples for the weka.core.Instances method deleteAttributeType
public void deleteAttributeType(int attType)
From source file:sirius.trainer.step3.SelectFeaturePane.java
License:Open Source License
protected void applyFilter(final Filter filter) { if (applicationData.getOneThread() == null) { applicationData.setOneThread(new Thread() { public void run() { try { if (filter != null) { // String cmd = filter.getClass().getName(); // if(filter instanceof OptionHandler) // cmd += " " + Utils.joinOptions(((OptionHandler) filter).getOptions()); /*comment away for the time being int classIndex = m_AttVisualizePanel.getColoringIndex(); if ((classIndex < 0) && (filter instanceof SupervisedFilter)) { throw new IllegalArgumentException("Class (colour) needs to " + "be set for supervised " + "filter."); }*//*w w w .java 2 s .co m*/ Instances copy = new Instances(applicationData.getDataset1Instances()); //copy.setClassIndex(classIndex); copy.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); copy.deleteAttributeType(Attribute.STRING); filter.setInputFormat(copy); statusPane.setText("Applying Filter.. May take a while.. Please wait.."); Instances newInstances = Filter.useFilter(copy, filter); if (newInstances == null || newInstances.numAttributes() < 1) { throw new Exception("Dataset is empty."); } addUndoPoint(); //m_AttVisualizePanel.setColoringIndex(copy.classIndex()); // if class was not set before, reset it again after use of filter if (applicationData.getDataset1Instances().classIndex() < 0) newInstances.setClassIndex(-1); //dataset1Instances = newInstances; setDataset1Instances(newInstances); statusPane.setText("Filter Applied.."); } } catch (Exception ex) { // Pop up an error optionpane ex.printStackTrace(); JOptionPane.showMessageDialog(parent, "Problem filtering instances:\n" + ex.getMessage(), "Apply Filter", JOptionPane.ERROR_MESSAGE); } applicationData.setOneThread(null); } }); applicationData.getOneThread().setPriority(Thread.MIN_PRIORITY); // UI has most priority applicationData.getOneThread().start(); } else { JOptionPane.showMessageDialog(parent, "Can't apply filter at this time,\n" + "currently busy with other IO", "Apply Filter", 
JOptionPane.WARNING_MESSAGE); } }
From source file:sirius.trainer.step4.DatasetGenerator.java
License:Open Source License
public static boolean generateDataset2(JInternalFrame parent, ApplicationData applicationData, int classifierTwoUpstream, int classifierTwoDownstream, Classifier classifierOne) { try {/*from w w w. ja va 2 s. c om*/ StatusPane statusPane = applicationData.getStatusPane(); int positiveDataset2FromInt = applicationData.getPositiveDataset2FromField(); int positiveDataset2ToInt = applicationData.getPositiveDataset2ToField(); int negativeDataset2FromInt = applicationData.getNegativeDataset2FromField(); int negativeDataset2ToInt = applicationData.getNegativeDataset2ToField(); int totalDataset2PositiveInstances = positiveDataset2ToInt - positiveDataset2FromInt + 1; int totalDataset2NegativeInstances = negativeDataset2ToInt - negativeDataset2FromInt + 1; int totalDataset2Instances = totalDataset2PositiveInstances + totalDataset2NegativeInstances; int scoringMatrixIndex = applicationData.getScoringMatrixIndex(); int countingStyleIndex = applicationData.getCountingStyleIndex(); //Generate the header for Dataset2.arff BufferedWriter dataset2OutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "Dataset2.arff")); dataset2OutputFile.write("@relation 'Dataset2.arff' "); dataset2OutputFile.newLine(); dataset2OutputFile.newLine(); dataset2OutputFile.flush(); for (int x = classifierTwoUpstream; x <= classifierTwoDownstream; x++) { if (x != 0) {//This statment is used because in sequence position only -1,+1 dun have 0 dataset2OutputFile.write("@attribute (" + x + ") numeric"); dataset2OutputFile.newLine(); dataset2OutputFile.flush(); } } if (positiveDataset2FromInt > 0 && negativeDataset2FromInt > 0) dataset2OutputFile.write("@attribute Class {pos,neg}"); else if (positiveDataset2FromInt > 0 && negativeDataset2FromInt == 0) dataset2OutputFile.write("@attribute Class {pos}"); else if (positiveDataset2FromInt == 0 && negativeDataset2FromInt > 0) dataset2OutputFile.write("@attribute Class {neg}"); dataset2OutputFile.newLine(); 
dataset2OutputFile.newLine(); dataset2OutputFile.write("@data"); dataset2OutputFile.newLine(); dataset2OutputFile.newLine(); dataset2OutputFile.flush(); //Generating an Instance given a sequence with the current attributes //for dataset2.arff //Need this for parameter setting for tempInst Instances inst = applicationData.getDataset1Instances(); inst.deleteAttributeType(Attribute.STRING); FastaFileManipulation fastaFile = new FastaFileManipulation( applicationData.getPositiveStep1TableModel(), applicationData.getNegativeStep1TableModel(), positiveDataset2FromInt, positiveDataset2ToInt, negativeDataset2FromInt, negativeDataset2ToInt, applicationData.getWorkingDirectory()); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute must be ignored featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name())); } //Reading the fastaFile int lineCounter = 0; String _class = "pos"; FastaFormat fastaFormat; while ((fastaFormat = fastaFile.nextSequence(_class)) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier Two Training Not Complete"); dataset2OutputFile.close(); return false; } lineCounter++;//Putting it here will mean if lineCounter is x then line == sequence x //if((lineCounter % 100) == 0){ dataset2OutputFile.flush(); statusPane.setText("Generating Dataset2.arff.. 
@ " + lineCounter + " / " + totalDataset2Instances + " Sequences"); //} //For each sequence, you want to shift from upstream till downstream //ie changing the +1 location //to get the scores given by classifier one so that you can use it to train classifier two later //Doing shift from upstream till downstream SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), classifierTwoUpstream, classifierTwoDownstream); String line2; while ((line2 = seq.nextShift()) != null) { Instance tempInst; tempInst = new Instance(inst.numAttributes()); tempInst.setDataset(inst); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(x), scoringMatrixIndex, countingStyleIndex, applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(x, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(x, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(x, (String) obj); else { dataset2OutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(inst.numAttributes() - 1, _class); double[] results = classifierOne.distributionForInstance(tempInst); dataset2OutputFile.write("" + results[0] + ","); } dataset2OutputFile.write(_class); dataset2OutputFile.newLine(); if (lineCounter == totalDataset2PositiveInstances) _class = "neg"; } dataset2OutputFile.close(); fastaFile.cleanUp(); } catch (Exception e) { e.printStackTrace(); JOptionPane.showMessageDialog(parent, e.getMessage(), "ERROR", JOptionPane.ERROR_MESSAGE); applicationData.getStatusPane().setText("Error - Classifier Two Training Not Complete"); return false; } return true; }
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
public static Classifier startClassifierOne(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierOneDisplayTextArea, GenericObjectEditor m_ClassifierEditor, GraphPane myGraph, boolean test, ClassifierResults classifierResults, int range, double threshold) { try {/*from w ww. j av a2 s. co m*/ StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; //Setting up training dataset 1 for classifier one statusPane.setText("Setting up..."); //Load Dataset1 Instances Instances inst = new Instances(applicationData.getDataset1Instances()); inst.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); applicationData.getDataset1Instances() .setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); // for timing long trainTimeStart = 0, trainTimeElapsed = 0; Classifier classifierOne = (Classifier) m_ClassifierEditor.getValue(); statusPane.setText("Training Classifier One... May take a while... 
Please wait..."); trainTimeStart = System.currentTimeMillis(); inst.deleteAttributeType(Attribute.STRING); classifierOne.buildClassifier(inst); trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; String classifierName = m_ClassifierEditor.getValue().getClass().getName(); classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", applicationData.getWorkingDirectory() + File.separator + "Dataset1.arff"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); if (test == false) { statusPane.setText("Classifier One Training Completed...Done..."); return classifierOne; } if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier One Training Completed"); return classifierOne; } //Running classifier one on dataset3 if (statusPane != null) statusPane.setText("Running ClassifierOne on Dataset 3.."); //Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel(); //Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel(); int positiveDataset3FromInt = applicationData.getPositiveDataset3FromField(); int positiveDataset3ToInt = applicationData.getPositiveDataset3ToField(); int negativeDataset3FromInt = applicationData.getNegativeDataset3FromField(); int negativeDataset3ToInt = applicationData.getNegativeDataset3ToField(); //Generate the header for ClassifierOne.scores on Dataset3 BufferedWriter dataset3OutputFile = new BufferedWriter(new FileWriter( applicationData.getWorkingDirectory() + File.separator + "ClassifierOne.scores")); if (m_ClassifierEditor.getValue() instanceof OptionHandler) classifierName += " " + Utils.joinOptions(((OptionHandler) m_ClassifierEditor.getValue()).getOptions()); FastaFileManipulation fastaFile = new FastaFileManipulation( 
applicationData.getPositiveStep1TableModel(), applicationData.getNegativeStep1TableModel(), positiveDataset3FromInt, positiveDataset3ToInt, negativeDataset3FromInt, negativeDataset3ToInt, applicationData.getWorkingDirectory()); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute must be ignored featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name())); } //Reading the fastaFile int lineCounter = 0; String _class = "pos"; int totalDataset3PositiveInstances = positiveDataset3ToInt - positiveDataset3FromInt + 1; FastaFormat fastaFormat; while ((fastaFormat = fastaFile.nextSequence(_class)) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier One Training Completed"); dataset3OutputFile.close(); return classifierOne; } lineCounter++;//Putting it here will mean if lineCounter is x then line == sequence x dataset3OutputFile.write(fastaFormat.getHeader()); dataset3OutputFile.newLine(); dataset3OutputFile.write(fastaFormat.getSequence()); dataset3OutputFile.newLine(); //if((lineCounter % 100) == 0){ statusPane.setText("Running Classifier One on Dataset 3.. @ " + lineCounter + " / " + applicationData.getTotalSequences(3) + " Sequences"); //} // for +1 index being -1, only make one prediction for the whole sequence if (fastaFormat.getIndexLocation() == -1) { //Should not have reached here... 
dataset3OutputFile.close(); throw new Exception("SHOULD NOT HAVE REACHED HERE!!"); } else {// for +1 index being non -1, make prediction on every possible position //For each sequence, you want to shift from predictPositionFrom till predictPositionTo //ie changing the +1 location //to get the scores given by classifier one so that //you can use it to train classifier two later //Doing shift from predictPositionFrom till predictPositionTo int predictPosition[]; predictPosition = fastaFormat.getPredictPositionForClassifierOne( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); SequenceManipulation seq = new SequenceManipulation(fastaFormat.getSequence(), predictPosition[0], predictPosition[1]); String line2; int currentPosition = predictPosition[0]; dataset3OutputFile.write(_class); while ((line2 = seq.nextShift()) != null) { Instance tempInst; tempInst = new Instance(inst.numAttributes()); tempInst.setDataset(inst); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(fastaFormat.getHeader(), line2, featureDataArrayList.get(x), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(x, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(x, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(x, (String) obj); else { dataset3OutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(inst.numAttributes() - 1, _class); double[] results = classifierOne.distributionForInstance(tempInst); dataset3OutputFile.write("," + currentPosition + "=" + results[0]); //AHFU_DEBUG /*if(currentPosition >= 
setClassifierTwoUpstreamInt && currentPosition <= setClassifierTwoDownstreamInt) testClassifierTwoArff.write(results[0] + ",");*/ //AHFU_DEBUG_END currentPosition++; if (currentPosition == 0) currentPosition++; } // end of while((line2 = seq.nextShift())!=null) //AHFU_DEBUG /*testClassifierTwoArff.write(_class); testClassifierTwoArff.newLine(); testClassifierTwoArff.flush();*/ //AHFU_DEBUG_END dataset3OutputFile.newLine(); dataset3OutputFile.flush(); if (lineCounter == totalDataset3PositiveInstances) _class = "neg"; } //end of inside non -1 } // end of while((fastaFormat = fastaFile.nextSequence(_class))!=null) dataset3OutputFile.close(); PredictionStats classifierOneStatsOnBlindTest = new PredictionStats( applicationData.getWorkingDirectory() + File.separator + "ClassifierOne.scores", range, threshold); totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); classifierOneStatsOnBlindTest.updateDisplay(classifierResults, classifierOneDisplayTextArea, true); applicationData.setClassifierOneStats(classifierOneStatsOnBlindTest); myGraph.setMyStats(classifierOneStatsOnBlindTest); statusPane.setText("Done!"); fastaFile.cleanUp(); return classifierOne; } catch (Exception ex) { ex.printStackTrace(); JOptionPane.showMessageDialog(parent, ex.getMessage() + "Classifier One on Blind Test Set", "Evaluate classifier", JOptionPane.ERROR_MESSAGE); return null; } }
From source file:sirius.trainer.step4.RunClassifier.java
License:Open Source License
public static Classifier xValidateClassifierOne(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierOneDisplayTextArea, GenericObjectEditor m_ClassifierEditor, int folds, GraphPane myGraph, ClassifierResults classifierResults, int range, double threshold, boolean outputClassifier) { try {/*from w ww. ja v a 2 s . co m*/ StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; //Classifier tempClassifier = (Classifier) m_ClassifierEditor.getValue(); int positiveDataset1FromInt = applicationData.getPositiveDataset1FromField(); int positiveDataset1ToInt = applicationData.getPositiveDataset1ToField(); int negativeDataset1FromInt = applicationData.getNegativeDataset1FromField(); int negativeDataset1ToInt = applicationData.getNegativeDataset1ToField(); Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel(); Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel(); Instances inst = new Instances(applicationData.getDataset1Instances()); inst.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); //Train classifier one with the full dataset first then do cross-validation to gauge its accuracy long trainTimeStart = 0, trainTimeElapsed = 0; Classifier classifierOne = (Classifier) m_ClassifierEditor.getValue(); statusPane.setText("Training Classifier One... May take a while... 
Please wait..."); //Record Start Time trainTimeStart = System.currentTimeMillis(); inst.deleteAttributeType(Attribute.STRING); if (outputClassifier) classifierOne.buildClassifier(inst); //Record Total Time used to build classifier one trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; //Training Done String classifierName = m_ClassifierEditor.getValue().getClass().getName(); classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", folds + " fold cross-validation on Dataset1.arff"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int y = 0; y < inst.numAttributes() - 1; y++) { featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(y).name())); } BufferedWriter outputCrossValidation = new BufferedWriter(new FileWriter( applicationData.getWorkingDirectory() + File.separator + "ClassifierOne.scores")); for (int x = 0; x < folds; x++) { File trainFile = new File(applicationData.getWorkingDirectory() + File.separator + "trainingDataset1_" + (x + 1) + ".arff"); File testFile = new File(applicationData.getWorkingDirectory() + File.separator + "testingDataset1_" + (x + 1) + ".fasta"); //AHFU_DEBUG //Generate also the training file in fasta format for debugging purpose File trainFileFasta = new File(applicationData.getWorkingDirectory() + File.separator + "trainingDataset1_" + (x + 1) + ".fasta"); //AHFU_DEBUG_END //AHFU_DEBUG - This part is to generate the TestClassifierTwo.arff for use in WEKA to test classifierTwo //TestClassifierTwo.arff - predictions scores from Set Upstream Field to Set Downstream Field //Now first generate the header for TestClassifierTwo.arff BufferedWriter testClassifierTwoArff = new 
BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "TestClassifierTwo_" + (x + 1) + ".arff")); int setClassifierTwoUpstreamInt = -40; int setClassifierTwoDownstreamInt = 41; testClassifierTwoArff.write("@relation \'Used to Test Classifier Two\'"); testClassifierTwoArff.newLine(); for (int d = setClassifierTwoUpstreamInt; d <= setClassifierTwoDownstreamInt; d++) { if (d == 0) continue; testClassifierTwoArff.write("@attribute (" + d + ") numeric"); testClassifierTwoArff.newLine(); } if (positiveDataset1FromInt > 0 && negativeDataset1FromInt > 0) testClassifierTwoArff.write("@attribute Class {pos,neg}"); else if (positiveDataset1FromInt > 0 && negativeDataset1FromInt == 0) testClassifierTwoArff.write("@attribute Class {pos}"); else if (positiveDataset1FromInt == 0 && negativeDataset1FromInt > 0) testClassifierTwoArff.write("@attribute Class {neg}"); testClassifierTwoArff.newLine(); testClassifierTwoArff.newLine(); testClassifierTwoArff.write("@data"); testClassifierTwoArff.newLine(); testClassifierTwoArff.newLine(); //END of AHFU_DEBUG statusPane.setText("Building Fold " + (x + 1) + "..."); FastaFileManipulation fastaFile = new FastaFileManipulation(positiveStep1TableModel, negativeStep1TableModel, positiveDataset1FromInt, positiveDataset1ToInt, negativeDataset1FromInt, negativeDataset1ToInt, applicationData.getWorkingDirectory()); //1) generate trainingDatasetX.arff headings BufferedWriter trainingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "trainingDataset1_" + (x + 1) + ".arff")); trainingOutputFile.write("@relation 'A temp file for X-validation purpose' "); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); for (int y = 0; y < inst.numAttributes() - 1; y++) { if (inst.attribute(y).type() == Attribute.NUMERIC) trainingOutputFile.write("@attribute " + inst.attribute(y).name() + " numeric"); else if (inst.attribute(y).type() == 
Attribute.STRING) trainingOutputFile.write("@attribute " + inst.attribute(y).name() + " String"); else { testClassifierTwoArff.close(); outputCrossValidation.close(); trainingOutputFile.close(); throw new Error("Unknown type: " + inst.attribute(y).name()); } trainingOutputFile.newLine(); trainingOutputFile.flush(); } if (positiveDataset1FromInt > 0 && negativeDataset1FromInt > 0) trainingOutputFile.write("@attribute Class {pos,neg}"); else if (positiveDataset1FromInt > 0 && negativeDataset1FromInt == 0) trainingOutputFile.write("@attribute Class {pos}"); else if (positiveDataset1FromInt == 0 && negativeDataset1FromInt > 0) trainingOutputFile.write("@attribute Class {neg}"); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.write("@data"); trainingOutputFile.newLine(); trainingOutputFile.newLine(); trainingOutputFile.flush(); //2) generate testingDataset1.fasta BufferedWriter testingOutputFile = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "testingDataset1_" + (x + 1) + ".fasta")); //AHFU_DEBUG //Open the IOStream for training file (fasta format) BufferedWriter trainingOutputFileFasta = new BufferedWriter( new FileWriter(applicationData.getWorkingDirectory() + File.separator + "trainingDataset1_" + (x + 1) + ".fasta")); //AHFU_DEBUG_END //Now, populating data for both the training and testing files int fastaFileLineCounter = 0; int posTestSequenceCounter = 0; int totalTestSequenceCounter = 0; //For pos sequences FastaFormat fastaFormat; while ((fastaFormat = fastaFile.nextSequence("pos")) != null) { if ((fastaFileLineCounter % folds) == x) {//This sequence for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); posTestSequenceCounter++; totalTestSequenceCounter++; } else {//for training for (int z = 0; z < inst.numAttributes() - 1; z++) { 
trainingOutputFile.write(GenerateArff.getMatchCount(fastaFormat, featureDataArrayList.get(z), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()) + ","); } trainingOutputFile.write("pos"); trainingOutputFile.newLine(); trainingOutputFile.flush(); //AHFU_DEBUG //Write the datas into the training file in fasta format trainingOutputFileFasta.write(fastaFormat.getHeader()); trainingOutputFileFasta.newLine(); trainingOutputFileFasta.write(fastaFormat.getSequence()); trainingOutputFileFasta.newLine(); trainingOutputFileFasta.flush(); //AHFU_DEBUG_END } fastaFileLineCounter++; } //For neg sequences fastaFileLineCounter = 0; while ((fastaFormat = fastaFile.nextSequence("neg")) != null) { if ((fastaFileLineCounter % folds) == x) {//This sequence for testing testingOutputFile.write(fastaFormat.getHeader()); testingOutputFile.newLine(); testingOutputFile.write(fastaFormat.getSequence()); testingOutputFile.newLine(); testingOutputFile.flush(); totalTestSequenceCounter++; } else {//for training for (int z = 0; z < inst.numAttributes() - 1; z++) { trainingOutputFile.write(GenerateArff.getMatchCount(fastaFormat, featureDataArrayList.get(z), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()) + ","); } trainingOutputFile.write("neg"); trainingOutputFile.newLine(); trainingOutputFile.flush(); //AHFU_DEBUG //Write the datas into the training file in fasta format trainingOutputFileFasta.write(fastaFormat.getHeader()); trainingOutputFileFasta.newLine(); trainingOutputFileFasta.write(fastaFormat.getSequence()); trainingOutputFileFasta.newLine(); trainingOutputFileFasta.flush(); //AHFU_DEBUG_END } fastaFileLineCounter++; } trainingOutputFileFasta.close(); trainingOutputFile.close(); testingOutputFile.close(); //3) train and test the classifier then store the statistics Classifier foldClassifier = (Classifier) m_ClassifierEditor.getValue(); Instances 
instFoldTrain = new Instances( new BufferedReader(new FileReader(applicationData.getWorkingDirectory() + File.separator + "trainingDataset1_" + (x + 1) + ".arff"))); instFoldTrain.setClassIndex(instFoldTrain.numAttributes() - 1); foldClassifier.buildClassifier(instFoldTrain); //Reading the test file statusPane.setText("Evaluating fold " + (x + 1) + ".."); BufferedReader testingInput = new BufferedReader( new FileReader(applicationData.getWorkingDirectory() + File.separator + "testingDataset1_" + (x + 1) + ".fasta")); int lineCounter = 0; String lineHeader; String lineSequence; while ((lineHeader = testingInput.readLine()) != null) { if (applicationData.terminateThread == true) { statusPane.setText("Interrupted - Classifier One Training Completed"); testingInput.close(); testClassifierTwoArff.close(); return classifierOne; } lineSequence = testingInput.readLine(); outputCrossValidation.write(lineHeader); outputCrossValidation.newLine(); outputCrossValidation.write(lineSequence); outputCrossValidation.newLine(); lineCounter++; //For each sequence, you want to shift from upstream till downstream //ie changing the +1 location //to get the scores by classifier one so that can use it to train classifier two later //Doing shift from upstream till downstream //if(lineCounter % 100 == 0) statusPane.setText("Evaluating fold " + (x + 1) + ".. 
@ " + lineCounter + " / " + totalTestSequenceCounter); fastaFormat = new FastaFormat(lineHeader, lineSequence); int predictPosition[] = fastaFormat.getPredictPositionForClassifierOne( applicationData.getLeftMostPosition(), applicationData.getRightMostPosition()); SequenceManipulation seq = new SequenceManipulation(lineSequence, predictPosition[0], predictPosition[1]); int currentPosition = predictPosition[0]; String line2; if (lineCounter > posTestSequenceCounter) outputCrossValidation.write("neg"); else outputCrossValidation.write("pos"); while ((line2 = seq.nextShift()) != null) { Instance tempInst; tempInst = new Instance(inst.numAttributes()); tempInst.setDataset(inst); for (int i = 0; i < inst.numAttributes() - 1; i++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(lineHeader, line2, featureDataArrayList.get(i), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(x, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(x, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(x, (String) obj); else { testingInput.close(); testClassifierTwoArff.close(); outputCrossValidation.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } if (lineCounter > posTestSequenceCounter) tempInst.setValue(inst.numAttributes() - 1, "neg"); else tempInst.setValue(inst.numAttributes() - 1, "pos"); double[] results = foldClassifier.distributionForInstance(tempInst); outputCrossValidation.write("," + currentPosition + "=" + results[0]); //AHFU_DEBUG double[] resultsDebug = classifierOne.distributionForInstance(tempInst); if (currentPosition >= setClassifierTwoUpstreamInt && currentPosition <= 
setClassifierTwoDownstreamInt) testClassifierTwoArff.write(resultsDebug[0] + ","); //AHFU_DEBUG_END currentPosition++; if (currentPosition == 0) currentPosition++; } //end of sequence shift outputCrossValidation.newLine(); outputCrossValidation.flush(); //AHFU_DEBUG if (lineCounter > posTestSequenceCounter) testClassifierTwoArff.write("neg"); else testClassifierTwoArff.write("pos"); testClassifierTwoArff.newLine(); testClassifierTwoArff.flush(); //AHFU_DEBUG_END } //end of reading test file outputCrossValidation.close(); testingInput.close(); testClassifierTwoArff.close(); fastaFile.cleanUp(); //NORMAL MODE //trainFile.delete(); //testFile.delete(); //NORMAL MODE END //AHFU_DEBUG MODE //testClassifierTwoArff.close(); trainFile.deleteOnExit(); testFile.deleteOnExit(); trainFileFasta.deleteOnExit(); //AHFU_DEBUG_MODE_END } //end of for loop for xvalidation PredictionStats classifierOneStatsOnXValidation = new PredictionStats( applicationData.getWorkingDirectory() + File.separator + "ClassifierOne.scores", range, threshold); //display(double range) totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); classifierOneStatsOnXValidation.updateDisplay(classifierResults, classifierOneDisplayTextArea, true); applicationData.setClassifierOneStats(classifierOneStatsOnXValidation); myGraph.setMyStats(classifierOneStatsOnXValidation); statusPane.setText("Done!"); return classifierOne; } catch (Exception e) { e.printStackTrace(); JOptionPane.showMessageDialog(parent, e.getMessage(), "ERROR", JOptionPane.ERROR_MESSAGE); return null; } }
From source file:sirius.trainer.step4.RunClassifierWithNoLocationIndex.java
License:Open Source License
public static Object startClassifierOneWithNoLocationIndex(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierOneDisplayTextArea, GraphPane myGraph, boolean test, ClassifierResults classifierResults, int range, double threshold, String classifierName, String[] classifierOptions, boolean returnClassifier, GeneticAlgorithmDialog gaDialog, int randomNumberForClassifier) { try {//from ww w . j a v a 2s . c o m if (gaDialog != null) { //Run GA then load the result maxMCCFeatures into applicationData->Dataset1Instances int positiveDataset1FromInt = applicationData.getPositiveDataset1FromField(); int positiveDataset1ToInt = applicationData.getPositiveDataset1ToField(); int negativeDataset1FromInt = applicationData.getNegativeDataset1FromField(); int negativeDataset1ToInt = applicationData.getNegativeDataset1ToField(); FastaFileManipulation fastaFile = new FastaFileManipulation( applicationData.getPositiveStep1TableModel(), applicationData.getNegativeStep1TableModel(), positiveDataset1FromInt, positiveDataset1ToInt, negativeDataset1FromInt, negativeDataset1ToInt, applicationData.getWorkingDirectory()); FastaFormat fastaFormat; List<FastaFormat> posFastaList = new ArrayList<FastaFormat>(); List<FastaFormat> negFastaList = new ArrayList<FastaFormat>(); while ((fastaFormat = fastaFile.nextSequence("pos")) != null) { posFastaList.add(fastaFormat); } while ((fastaFormat = fastaFile.nextSequence("neg")) != null) { negFastaList.add(fastaFormat); } applicationData.setDataset1Instances( runDAandLoadResult(applicationData, gaDialog, posFastaList, negFastaList)); } StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; //Setting up training data set 1 for classifier one if (statusPane != null) statusPane.setText("Setting up..."); //Load Dataset1 Instances Instances inst = new Instances(applicationData.getDataset1Instances()); 
inst.setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); applicationData.getDataset1Instances() .setClassIndex(applicationData.getDataset1Instances().numAttributes() - 1); // for recording of time long trainTimeStart = 0, trainTimeElapsed = 0; Classifier classifierOne = Classifier.forName(classifierName, classifierOptions); /*//Used to show the classifierName and options so that I can use them for qsub System.out.println(classifierName); String[] optionString = classifierOne.getOptions(); for(int x = 0; x < optionString.length; x++) System.out.println(optionString[x]);*/ if (statusPane != null) statusPane.setText("Training Classifier One... May take a while... Please wait..."); //Record Start Time trainTimeStart = System.currentTimeMillis(); //Train Classifier One inst.deleteAttributeType(Attribute.STRING); classifierOne.buildClassifier(inst); //Record Total Time used to build classifier one trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; if (classifierResults != null) { classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", classifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", applicationData.getWorkingDirectory() + File.separator + "Dataset1.arff"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); } if (test == false) { //If Need Not Test option is selected if (statusPane != null) statusPane.setText("Done!"); return classifierOne; } if (applicationData.terminateThread == true) { //If Stop button is pressed if (statusPane != null) statusPane.setText("Interrupted - Classifier One Training Completed"); return classifierOne; } //Running classifier one on dataset3 if (statusPane != null) statusPane.setText("Running ClassifierOne on Dataset 3.."); int positiveDataset3FromInt = applicationData.getPositiveDataset3FromField(); int positiveDataset3ToInt = 
applicationData.getPositiveDataset3ToField(); int negativeDataset3FromInt = applicationData.getNegativeDataset3FromField(); int negativeDataset3ToInt = applicationData.getNegativeDataset3ToField(); //Generate the header for ClassifierOne.scores on Dataset3 String classifierOneFilename = applicationData.getWorkingDirectory() + File.separator + "ClassifierOne_" + randomNumberForClassifier + ".scores"; BufferedWriter dataset3OutputFile = new BufferedWriter(new FileWriter(classifierOneFilename)); FastaFileManipulation fastaFile = new FastaFileManipulation( applicationData.getPositiveStep1TableModel(), applicationData.getNegativeStep1TableModel(), positiveDataset3FromInt, positiveDataset3ToInt, negativeDataset3FromInt, negativeDataset3ToInt, applicationData.getWorkingDirectory()); //Reading and Storing the featureList ArrayList<Feature> featureDataArrayList = new ArrayList<Feature>(); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute must be ignored featureDataArrayList.add(Feature.levelOneClassifierPane(inst.attribute(x).name())); } //Reading the fastaFile int lineCounter = 0; String _class = "pos"; int totalDataset3PositiveInstances = positiveDataset3ToInt - positiveDataset3FromInt + 1; FastaFormat fastaFormat; while ((fastaFormat = fastaFile.nextSequence(_class)) != null) { if (applicationData.terminateThread == true) { if (statusPane != null) statusPane.setText("Interrupted - Classifier One Training Completed"); dataset3OutputFile.close(); return classifierOne; } dataset3OutputFile.write(fastaFormat.getHeader()); dataset3OutputFile.newLine(); dataset3OutputFile.write(fastaFormat.getSequence()); dataset3OutputFile.newLine(); lineCounter++;//Putting it here will mean if lineCounter is x then line == sequence x dataset3OutputFile.flush(); if (statusPane != null) statusPane.setText("Running Classifier One on Dataset 3.. 
@ " + lineCounter + " / " + applicationData.getTotalSequences(3) + " Sequences"); Instance tempInst; tempInst = new Instance(inst.numAttributes()); tempInst.setDataset(inst); for (int x = 0; x < inst.numAttributes() - 1; x++) { //-1 because class attribute can be ignored //Give the sequence and the featureList to get the feature freqs on the sequence Object obj = GenerateArff.getMatchCount(fastaFormat, featureDataArrayList.get(x), applicationData.getScoringMatrixIndex(), applicationData.getCountingStyleIndex(), applicationData.getScoringMatrix()); if (obj.getClass().getName().equalsIgnoreCase("java.lang.Integer")) tempInst.setValue(x, (Integer) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.Double")) tempInst.setValue(x, (Double) obj); else if (obj.getClass().getName().equalsIgnoreCase("java.lang.String")) tempInst.setValue(x, (String) obj); else { dataset3OutputFile.close(); throw new Error("Unknown: " + obj.getClass().getName()); } } tempInst.setValue(inst.numAttributes() - 1, _class); double[] results = classifierOne.distributionForInstance(tempInst); dataset3OutputFile.write(_class + ",0=" + results[0]); dataset3OutputFile.newLine(); dataset3OutputFile.flush(); if (lineCounter == totalDataset3PositiveInstances) _class = "neg"; } dataset3OutputFile.close(); //Display Statistics by reading the ClassifierOne.scores PredictionStats classifierOneStatsOnBlindTest = new PredictionStats(classifierOneFilename, range, threshold); //display(double range) totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; if (classifierResults != null) { classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); classifierOneStatsOnBlindTest.updateDisplay(classifierResults, classifierOneDisplayTextArea, true); } else classifierOneStatsOnBlindTest.updateDisplay(classifierResults, 
classifierOneDisplayTextArea, true); applicationData.setClassifierOneStats(classifierOneStatsOnBlindTest); if (myGraph != null) myGraph.setMyStats(classifierOneStatsOnBlindTest); if (statusPane != null) statusPane.setText("Done!"); fastaFile.cleanUp(); if (returnClassifier) return classifierOne; else return classifierOneStatsOnBlindTest; } catch (Exception ex) { ex.printStackTrace(); JOptionPane.showMessageDialog(parent, ex.getMessage(), "Evaluate classifier", JOptionPane.ERROR_MESSAGE); return null; } }
From source file:sirius.trainer.step4.RunClassifierWithNoLocationIndex.java
License:Open Source License
public static Object jackKnifeClassifierOneWithNoLocationIndex(JInternalFrame parent, ApplicationData applicationData, JTextArea classifierOneDisplayTextArea, GenericObjectEditor m_ClassifierEditor, double ratio, GraphPane myGraph, ClassifierResults classifierResults, int range, double threshold, boolean outputClassifier, String classifierName, String[] classifierOptions, boolean returnClassifier, int randomNumberForClassifier) { try {//from ww w .j ava2 s . co m StatusPane statusPane = applicationData.getStatusPane(); long totalTimeStart = System.currentTimeMillis(), totalTimeElapsed; Classifier tempClassifier; if (m_ClassifierEditor != null) tempClassifier = (Classifier) m_ClassifierEditor.getValue(); else tempClassifier = Classifier.forName(classifierName, classifierOptions); //Assume that class attribute is the last attribute - This should be the case for all Sirius produced Arff files //split the instances into positive and negative Instances posInst = new Instances(applicationData.getDataset1Instances()); posInst.setClassIndex(posInst.numAttributes() - 1); for (int x = 0; x < posInst.numInstances();) if (posInst.instance(x).stringValue(posInst.numAttributes() - 1).equalsIgnoreCase("pos")) x++; else posInst.delete(x); posInst.deleteAttributeType(Attribute.STRING); Instances negInst = new Instances(applicationData.getDataset1Instances()); negInst.setClassIndex(negInst.numAttributes() - 1); for (int x = 0; x < negInst.numInstances();) if (negInst.instance(x).stringValue(negInst.numAttributes() - 1).equalsIgnoreCase("neg")) x++; else negInst.delete(x); negInst.deleteAttributeType(Attribute.STRING); //Train classifier one with the full dataset first then do cross-validation to gauge its accuracy long trainTimeStart = 0, trainTimeElapsed = 0; if (statusPane != null) statusPane.setText("Training Classifier One... May take a while... 
Please wait..."); //Record Start Time trainTimeStart = System.currentTimeMillis(); Instances fullInst = new Instances(applicationData.getDataset1Instances()); fullInst.setClassIndex(fullInst.numAttributes() - 1); Classifier classifierOne; if (m_ClassifierEditor != null) classifierOne = (Classifier) m_ClassifierEditor.getValue(); else classifierOne = Classifier.forName(classifierName, classifierOptions); if (outputClassifier) classifierOne.buildClassifier(fullInst); //Record Total Time used to build classifier one trainTimeElapsed = System.currentTimeMillis() - trainTimeStart; //Training Done String tclassifierName; if (m_ClassifierEditor != null) tclassifierName = m_ClassifierEditor.getValue().getClass().getName(); else tclassifierName = classifierName; if (classifierResults != null) { classifierResults.updateList(classifierResults.getClassifierList(), "Classifier: ", tclassifierName); classifierResults.updateList(classifierResults.getClassifierList(), "Training Data: ", " Jack Knife Validation"); classifierResults.updateList(classifierResults.getClassifierList(), "Time Used: ", Utils.doubleToString(trainTimeElapsed / 1000.0, 2) + " seconds"); } String classifierOneFilename = applicationData.getWorkingDirectory() + File.separator + "ClassifierOne_" + randomNumberForClassifier + ".scores"; BufferedWriter outputCrossValidation = new BufferedWriter(new FileWriter(classifierOneFilename)); //Instances foldTrainingInstance; //Instances foldTestingInstance; int positiveDataset1FromInt = applicationData.getPositiveDataset1FromField(); int positiveDataset1ToInt = applicationData.getPositiveDataset1ToField(); int negativeDataset1FromInt = applicationData.getNegativeDataset1FromField(); int negativeDataset1ToInt = applicationData.getNegativeDataset1ToField(); Step1TableModel positiveStep1TableModel = applicationData.getPositiveStep1TableModel(); Step1TableModel negativeStep1TableModel = applicationData.getNegativeStep1TableModel(); FastaFileManipulation fastaFile = new 
FastaFileManipulation(positiveStep1TableModel, negativeStep1TableModel, positiveDataset1FromInt, positiveDataset1ToInt, negativeDataset1FromInt, negativeDataset1ToInt, applicationData.getWorkingDirectory()); FastaFormat fastaFormat; String header[] = new String[fullInst.numInstances()]; String data[] = new String[fullInst.numInstances()]; int counter = 0; while ((fastaFormat = fastaFile.nextSequence("pos")) != null) { header[counter] = fastaFormat.getHeader(); data[counter] = fastaFormat.getSequence(); counter++; } while ((fastaFormat = fastaFile.nextSequence("neg")) != null) { header[counter] = fastaFormat.getHeader(); data[counter] = fastaFormat.getSequence(); counter++; } //run jack knife validation for (int x = 0; x < fullInst.numInstances(); x++) { if (applicationData.terminateThread == true) { if (statusPane != null) statusPane.setText("Interrupted - Classifier One Training Completed"); outputCrossValidation.close(); return classifierOne; } if (statusPane != null) statusPane.setText("Running " + (x + 1) + " / " + fullInst.numInstances()); Instances trainPosInst = new Instances(posInst); Instances trainNegInst = new Instances(negInst); Instance testInst; //split data into training and testing if (x < trainPosInst.numInstances()) { testInst = posInst.instance(x); trainPosInst.delete(x); } else { testInst = negInst.instance(x - posInst.numInstances()); trainNegInst.delete(x - posInst.numInstances()); } Instances trainInstances; if (trainPosInst.numInstances() < trainNegInst.numInstances()) { trainInstances = new Instances(trainPosInst); int max = (int) (ratio * trainPosInst.numInstances()); if (ratio == -1) max = trainNegInst.numInstances(); Random rand = new Random(1); for (int y = 0; y < trainNegInst.numInstances() && y < max; y++) { int index = rand.nextInt(trainNegInst.numInstances()); trainInstances.add(trainNegInst.instance(index)); trainNegInst.delete(index); } } else { trainInstances = new Instances(trainNegInst); int max = (int) (ratio * 
trainNegInst.numInstances()); if (ratio == -1) max = trainPosInst.numInstances(); Random rand = new Random(1); for (int y = 0; y < trainPosInst.numInstances() && y < max; y++) { int index = rand.nextInt(trainPosInst.numInstances()); trainInstances.add(trainPosInst.instance(index)); trainPosInst.delete(index); } } Classifier foldClassifier = tempClassifier; foldClassifier.buildClassifier(trainInstances); double[] results = foldClassifier.distributionForInstance(testInst); int classIndex = testInst.classIndex(); String classValue = testInst.toString(classIndex); outputCrossValidation.write(header[x]); outputCrossValidation.newLine(); outputCrossValidation.write(data[x]); outputCrossValidation.newLine(); if (classValue.equals("pos")) outputCrossValidation.write("pos,0=" + results[0]); else if (classValue.equals("neg")) outputCrossValidation.write("neg,0=" + results[0]); else { outputCrossValidation.close(); throw new Error("Invalid Class Type!"); } outputCrossValidation.newLine(); outputCrossValidation.flush(); } outputCrossValidation.close(); PredictionStats classifierOneStatsOnJackKnife = new PredictionStats(classifierOneFilename, range, threshold); totalTimeElapsed = System.currentTimeMillis() - totalTimeStart; if (classifierResults != null) classifierResults.updateList(classifierResults.getResultsList(), "Total Time Used: ", Utils.doubleToString(totalTimeElapsed / 60000, 2) + " minutes " + Utils.doubleToString((totalTimeElapsed / 1000.0) % 60.0, 2) + " seconds"); //if(classifierOneDisplayTextArea != null) classifierOneStatsOnJackKnife.updateDisplay(classifierResults, classifierOneDisplayTextArea, true); applicationData.setClassifierOneStats(classifierOneStatsOnJackKnife); if (myGraph != null) myGraph.setMyStats(classifierOneStatsOnJackKnife); if (statusPane != null) statusPane.setText("Done!"); if (returnClassifier) return classifierOne; else return classifierOneStatsOnJackKnife; } catch (Exception e) { e.printStackTrace(); JOptionPane.showMessageDialog(parent, 
e.getMessage(), "ERROR", JOptionPane.ERROR_MESSAGE); return null; } }