com.civprod.writerstoolbox.OpenNLP.training.WordSplitingTokenizerTrainer.java Source code

Java tutorial

Introduction

Here is the source code for com.civprod.writerstoolbox.OpenNLP.training.WordSplitingTokenizerTrainer.java

Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package com.civprod.writerstoolbox.OpenNLP.training;

import com.civprod.swing.CollectionListModel;
import com.civprod.util.TimeComplexity;
import com.civprod.writerstoolbox.OpenNLP.OpenNLPUtils;
import com.civprod.writerstoolbox.OpenNLP.tokenize.WordSplittingTokenizerFactory;
import com.civprod.writerstoolbox.data.io.localFS.FileUtils;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
import javax.swing.JFileChooser;
import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerEvaluator;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.FMeasure;
import org.apache.commons.math3.stat.descriptive.SummaryStatistics;

/**
 *
 * @author Steven Owens
 */
public class WordSplitingTokenizerTrainer extends EditorWindow<TokenizerModel> {

    private Dictionary mAbbreviationDictionary = null;
    private Dictionary mSpellingDictionary = null;
    private CollectionListModel<File> mFileCollectionListModel;

    /**
     * Creates new form WordSplitingTokenizer
     */
    public WordSplitingTokenizerTrainer() {
        initComponents();
        mFileCollectionListModel = new CollectionListModel<>();
        listFiles.setModel(mFileCollectionListModel);
        comboTimeComplexity.setModel(new CollectionListModel<>(java.util.EnumSet.allOf(TimeComplexity.class)));
    }

    /**
     * This method is called from within the constructor to initialize the form.
     * WARNING: Do NOT modify this code. The content of this method is always
     * regenerated by the Form Editor.
     */
    @SuppressWarnings("unchecked")
    // <editor-fold defaultstate="collapsed" desc="Generated Code">//GEN-BEGIN:initComponents
    private void initComponents() {

        myFileChooser = new javax.swing.JFileChooser();
        cmdCreateAbbreviationDictionary = new javax.swing.JButton();
        cmdLoadAbbreviationDictionary = new javax.swing.JButton();
        cmdCreateSpellingDictionary = new javax.swing.JButton();
        cmdLoadSpellingDictionary = new javax.swing.JButton();
        jScrollPane1 = new javax.swing.JScrollPane();
        listFiles = new javax.swing.JList<File>();
        cmdLoadTrainingFile = new javax.swing.JButton();
        cmdDeleteTrainingFile = new javax.swing.JButton();
        cmdTrain = new javax.swing.JButton();
        jScrollPane2 = new javax.swing.JScrollPane();
        textTestResults = new javax.swing.JTextArea();
        comboTimeComplexity = new javax.swing.JComboBox<TimeComplexity>();

        setDefaultCloseOperation(javax.swing.WindowConstants.EXIT_ON_CLOSE);

        cmdCreateAbbreviationDictionary.setText("create abbreviation Dictionary");
        cmdCreateAbbreviationDictionary.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdCreateAbbreviationDictionaryActionPerformed(evt);
            }
        });

        cmdLoadAbbreviationDictionary.setText("load abbreviation Dictionary");
        cmdLoadAbbreviationDictionary.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdLoadAbbreviationDictionaryActionPerformed(evt);
            }
        });

        cmdCreateSpellingDictionary.setText("create spelling Dictionary");
        cmdCreateSpellingDictionary.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdCreateSpellingDictionaryActionPerformed(evt);
            }
        });

        cmdLoadSpellingDictionary.setText("load spelling Dictionary");
        cmdLoadSpellingDictionary.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdLoadSpellingDictionaryActionPerformed(evt);
            }
        });

        listFiles.setModel(new javax.swing.AbstractListModel() {
            String[] strings = { "Item 1", "Item 2", "Item 3", "Item 4", "Item 5" };

            public int getSize() {
                return strings.length;
            }

            public Object getElementAt(int i) {
                return strings[i];
            }
        });
        jScrollPane1.setViewportView(listFiles);

        cmdLoadTrainingFile.setText("load training file");
        cmdLoadTrainingFile.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdLoadTrainingFileActionPerformed(evt);
            }
        });

        cmdDeleteTrainingFile.setText("delete training file");
        cmdDeleteTrainingFile.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdDeleteTrainingFileActionPerformed(evt);
            }
        });

        cmdTrain.setText("train model");
        cmdTrain.addActionListener(new java.awt.event.ActionListener() {
            public void actionPerformed(java.awt.event.ActionEvent evt) {
                cmdTrainActionPerformed(evt);
            }
        });

        textTestResults.setColumns(20);
        textTestResults.setRows(5);
        jScrollPane2.setViewportView(textTestResults);

        comboTimeComplexity.setModel(
                new javax.swing.DefaultComboBoxModel(new String[] { "Item 1", "Item 2", "Item 3", "Item 4" }));

        javax.swing.GroupLayout layout = new javax.swing.GroupLayout(getContentPane());
        getContentPane().setLayout(layout);
        layout.setHorizontalGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                .addGroup(layout.createSequentialGroup().addContainerGap().addGroup(layout
                        .createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING).addComponent(jScrollPane2)
                        .addGroup(layout.createSequentialGroup().addGroup(layout
                                .createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                                .addGroup(
                                        layout.createSequentialGroup().addComponent(cmdCreateAbbreviationDictionary)
                                                .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                                .addComponent(cmdLoadAbbreviationDictionary))
                                .addGroup(layout.createSequentialGroup().addComponent(cmdCreateSpellingDictionary)
                                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                        .addComponent(cmdLoadSpellingDictionary))
                                .addGroup(layout.createSequentialGroup()
                                        .addComponent(jScrollPane1, javax.swing.GroupLayout.PREFERRED_SIZE,
                                                javax.swing.GroupLayout.DEFAULT_SIZE,
                                                javax.swing.GroupLayout.PREFERRED_SIZE)
                                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                        .addGroup(layout
                                                .createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                                                .addComponent(cmdLoadTrainingFile)
                                                .addComponent(cmdDeleteTrainingFile)))
                                .addComponent(cmdTrain).addComponent(comboTimeComplexity,
                                        javax.swing.GroupLayout.PREFERRED_SIZE,
                                        javax.swing.GroupLayout.DEFAULT_SIZE,
                                        javax.swing.GroupLayout.PREFERRED_SIZE))
                                .addGap(0, 0, Short.MAX_VALUE)))
                        .addContainerGap()));
        layout.setVerticalGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                .addGroup(layout.createSequentialGroup().addContainerGap()
                        .addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.BASELINE)
                                .addComponent(cmdCreateAbbreviationDictionary)
                                .addComponent(cmdLoadAbbreviationDictionary))
                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                        .addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.BASELINE)
                                .addComponent(cmdCreateSpellingDictionary).addComponent(cmdLoadSpellingDictionary))
                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                        .addComponent(comboTimeComplexity, javax.swing.GroupLayout.PREFERRED_SIZE,
                                javax.swing.GroupLayout.DEFAULT_SIZE, javax.swing.GroupLayout.PREFERRED_SIZE)
                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                        .addGroup(layout.createParallelGroup(javax.swing.GroupLayout.Alignment.LEADING)
                                .addComponent(jScrollPane1, javax.swing.GroupLayout.PREFERRED_SIZE,
                                        javax.swing.GroupLayout.DEFAULT_SIZE,
                                        javax.swing.GroupLayout.PREFERRED_SIZE)
                                .addGroup(layout.createSequentialGroup().addComponent(cmdLoadTrainingFile)
                                        .addPreferredGap(javax.swing.LayoutStyle.ComponentPlacement.RELATED)
                                        .addComponent(cmdDeleteTrainingFile)))
                        .addGap(18, 18, 18).addComponent(cmdTrain).addGap(18, 18, 18).addComponent(jScrollPane2,
                                javax.swing.GroupLayout.PREFERRED_SIZE, 212, javax.swing.GroupLayout.PREFERRED_SIZE)
                        .addContainerGap(142, Short.MAX_VALUE)));

        pack();
    }// </editor-fold>//GEN-END:initComponents

    private void cmdCreateAbbreviationDictionaryActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdCreateAbbreviationDictionaryActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        new Thread(() -> {
            final DictionaryEditor newDictionaryEditor = new DictionaryEditor();
            newDictionaryEditor.runOnClose(() -> {
                mAbbreviationDictionary = newDictionaryEditor.getCreatedObject();
                tempThis.setVisible(true);
            });
            tempThis.setVisible(false);
            newDictionaryEditor.setVisible(true);
        }).start();
    }//GEN-LAST:event_cmdCreateAbbreviationDictionaryActionPerformed

    private void cmdLoadAbbreviationDictionaryActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdLoadAbbreviationDictionaryActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        new Thread(() -> {
            tempThis.setVisible(false);
            int returnval = myFileChooser.showOpenDialog(tempThis);
            if (returnval == JFileChooser.APPROVE_OPTION) {
                final File selectedFile = myFileChooser.getSelectedFile();
                InputStream DictionaryInputStream = null;
                try {
                    DictionaryInputStream = new java.io.BufferedInputStream(new FileInputStream(selectedFile));
                    mAbbreviationDictionary = new Dictionary(DictionaryInputStream);
                } catch (IOException ex) {
                    Logger.getLogger(DictionaryEditor.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (DictionaryInputStream != null) {
                        try {
                            DictionaryInputStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(DictionaryEditor.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
            }
            tempThis.setVisible(true);
        }).start();
    }//GEN-LAST:event_cmdLoadAbbreviationDictionaryActionPerformed

    private void cmdCreateSpellingDictionaryActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdCreateSpellingDictionaryActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        new Thread(() -> {
            final DictionaryEditor newDictionaryEditor = new DictionaryEditor();
            newDictionaryEditor.runOnClose(() -> {
                mSpellingDictionary = newDictionaryEditor.getCreatedObject();
                tempThis.setVisible(true);
            });
            tempThis.setVisible(false);
            newDictionaryEditor.setVisible(true);
        }).start();
    }//GEN-LAST:event_cmdCreateSpellingDictionaryActionPerformed

    private void cmdLoadSpellingDictionaryActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdLoadSpellingDictionaryActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        new Thread(() -> {
            tempThis.setVisible(false);
            int returnval = myFileChooser.showOpenDialog(tempThis);
            if (returnval == JFileChooser.APPROVE_OPTION) {
                final File selectedFile = myFileChooser.getSelectedFile();
                InputStream DictionaryInputStream = null;
                try {
                    DictionaryInputStream = new java.io.BufferedInputStream(new FileInputStream(selectedFile));
                    mSpellingDictionary = new Dictionary(DictionaryInputStream);
                } catch (IOException ex) {
                    Logger.getLogger(DictionaryEditor.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (DictionaryInputStream != null) {
                        try {
                            DictionaryInputStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(DictionaryEditor.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
            }
            tempThis.setVisible(true);
        }).start();
    }//GEN-LAST:event_cmdLoadSpellingDictionaryActionPerformed

    private void cmdLoadTrainingFileActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdLoadTrainingFileActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        new Thread(() -> {
            tempThis.setVisible(false);
            int returnval = myFileChooser.showOpenDialog(tempThis);
            if (returnval == JFileChooser.APPROVE_OPTION) {
                final File selectedFile = myFileChooser.getSelectedFile();
                mFileCollectionListModel.add(selectedFile);
            }
            tempThis.setVisible(true);
        }).start();
    }//GEN-LAST:event_cmdLoadTrainingFileActionPerformed

    private void cmdDeleteTrainingFileActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdDeleteTrainingFileActionPerformed
        mFileCollectionListModel.remove(listFiles.getSelectedValue());
    }//GEN-LAST:event_cmdDeleteTrainingFileActionPerformed

    private void cmdTrainActionPerformed(java.awt.event.ActionEvent evt) {//GEN-FIRST:event_cmdTrainActionPerformed
        final WordSplitingTokenizerTrainer tempThis = this;
        final Charset utf8 = Charset.forName("UTF-8");
        new Thread(() -> {
            textTestResults.setText("");
            //create TokenizerFactory part of the training context
            WordSplittingTokenizerFactory myTokenizerFactory = new WordSplittingTokenizerFactory("EN",
                    mAbbreviationDictionary, false, null, mSpellingDictionary,
                    (TimeComplexity) comboTimeComplexity.getSelectedItem());

            Tokenizer stdTokenizer = null;
            try {
                stdTokenizer = OpenNLPUtils.createTokenizer();
            } catch (IOException ex) {
                Logger.getLogger(WordSplitingTokenizerTrainer.class.getName()).log(Level.SEVERE, null, ex);
            }
            Tokenizer myNonSplitingTokenizer = null;
            try {
                myNonSplitingTokenizer = OpenNLPUtils.createTokenizer(OpenNLPUtils.readTokenizerModel(
                        OpenNLPUtils.buildModelFileStream(".\\data\\OpenNLP\\en-fiction-token.bin")));
            } catch (IOException ex) {
                Logger.getLogger(WordSplitingTokenizerTrainer.class.getName()).log(Level.SEVERE, null, ex);
            }
            List<FileSplit> FileSplits = FileSplit.generateFileSplitsLOO(mFileCollectionListModel);
            File trainingFile = new File("en-token.train");
            File testFile = new File("en-token.test");
            SummaryStatistics curFStats = new SummaryStatistics();
            SummaryStatistics curRecallStats = new SummaryStatistics();
            SummaryStatistics curPrecisionStats = new SummaryStatistics();
            SummaryStatistics stdFStats = new SummaryStatistics();
            SummaryStatistics stdRecallStats = new SummaryStatistics();
            SummaryStatistics stdPrecisionStats = new SummaryStatistics();
            SummaryStatistics myNonSplitFStats = new SummaryStatistics();
            SummaryStatistics myNonSplitRecallStats = new SummaryStatistics();
            SummaryStatistics myNonSplitPrecisionStats = new SummaryStatistics();
            java.io.BufferedWriter trainingFileWriter = null;
            for (FileSplit curFileSplit : FileSplits) {
                try {
                    //create training file
                    trainingFileWriter = new java.io.BufferedWriter(
                            new java.io.OutputStreamWriter(new java.io.FileOutputStream(trainingFile), utf8));
                    for (File curTrainingFile : curFileSplit.getTrainingFiles()) {
                        java.io.BufferedReader curTrainingFileReader = null;
                        try {
                            Charset fileCharset = FileUtils.determineCharset(curTrainingFile);
                            if (fileCharset == null) {
                                fileCharset = utf8;
                            }
                            curTrainingFileReader = new java.io.BufferedReader(new java.io.InputStreamReader(
                                    new java.io.FileInputStream(curTrainingFile), fileCharset));
                            while (curTrainingFileReader.ready()) {
                                String curLine = curTrainingFileReader.readLine();
                                trainingFileWriter.append(curLine).append("\n");
                            }
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        } finally {
                            if (curTrainingFileReader != null) {
                                curTrainingFileReader.close();
                            }
                        }
                    }
                    trainingFileWriter.write('\n');
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (trainingFileWriter != null) {
                        try {
                            trainingFileWriter.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
                //create test file
                java.io.BufferedWriter testFileWriter = null;
                try {
                    //create training file
                    testFileWriter = new java.io.BufferedWriter(
                            new java.io.OutputStreamWriter(new java.io.FileOutputStream(testFile), utf8));
                    for (File curTrainingFile : curFileSplit.getTestFiles()) {
                        String testingFileName = curTrainingFile.getCanonicalPath();
                        textTestResults
                                .setText(textTestResults.getText() + "testing with " + testingFileName + "\n");
                        java.io.BufferedReader curTrainingFileReader = null;
                        try {
                            Charset fileCharset = FileUtils.determineCharset(curTrainingFile);
                            if (fileCharset == null) {
                                fileCharset = utf8;
                            }
                            curTrainingFileReader = new java.io.BufferedReader(new java.io.InputStreamReader(
                                    new java.io.FileInputStream(curTrainingFile), fileCharset));
                            while (curTrainingFileReader.ready()) {
                                String curLine = curTrainingFileReader.readLine();
                                testFileWriter.append(curLine).append("\n");
                            }
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        } finally {
                            if (curTrainingFileReader != null) {
                                curTrainingFileReader.close();
                            }
                        }
                    }
                    testFileWriter.write('\n');
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (testFileWriter != null) {
                        try {
                            testFileWriter.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
                //create and train model
                ObjectStream<String> trainingLineStream = null;
                TokenizerModel train = null;
                try {
                    trainingLineStream = new PlainTextByLineStream(new FileInputStream(trainingFile), utf8);
                    ObjectStream<TokenSample> sampleStream = null;
                    try {
                        sampleStream = new TokenSampleStream(trainingLineStream);
                        train = TokenizerME.train(sampleStream, myTokenizerFactory,
                                TrainingParameters.defaultParams());
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (sampleStream != null) {
                            try {
                                sampleStream.close();
                            } catch (IOException ex) {
                                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null,
                                        ex);
                            }
                        }
                    }
                } catch (FileNotFoundException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (trainingLineStream != null) {
                        try {
                            trainingLineStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
                if (train != null) {
                    ObjectStream<String> testingLineStream = null;
                    try {
                        testingLineStream = new PlainTextByLineStream(new FileInputStream(testFile), utf8);
                        ObjectStream<TokenSample> sampleStream = null;
                        try {
                            sampleStream = new TokenSampleStream(testingLineStream);
                            TokenizerME testDetector = new TokenizerME(train);
                            TokenizerEvaluator evaluator = new TokenizerEvaluator(testDetector);
                            evaluator.evaluate(sampleStream);
                            FMeasure testFMeasure = evaluator.getFMeasure();
                            curFStats.addValue(testFMeasure.getFMeasure());
                            curRecallStats.addValue(testFMeasure.getRecallScore());
                            curPrecisionStats.addValue(testFMeasure.getPrecisionScore());
                            textTestResults.setText(textTestResults.getText() + testFMeasure.getFMeasure() + " "
                                    + testFMeasure.getPrecisionScore() + " " + testFMeasure.getRecallScore()
                                    + "\n");
                            if (stdTokenizer != null) {
                                testingLineStream = new PlainTextByLineStream(new FileInputStream(testFile), utf8);
                                sampleStream = new TokenSampleStream(testingLineStream);
                                TokenizerEvaluator stdEvaluator = new TokenizerEvaluator(stdTokenizer);
                                stdEvaluator.evaluate(sampleStream);
                                FMeasure stdFMeasure = stdEvaluator.getFMeasure();
                                stdFStats.addValue(stdFMeasure.getFMeasure());
                                stdRecallStats.addValue(stdFMeasure.getRecallScore());
                                stdPrecisionStats.addValue(stdFMeasure.getPrecisionScore());
                                textTestResults.setText(textTestResults.getText() + " " + stdFMeasure.getFMeasure()
                                        + " " + stdFMeasure.getPrecisionScore() + " " + stdFMeasure.getRecallScore()
                                        + "\n");
                            }
                            if (myNonSplitingTokenizer != null) {
                                testingLineStream = new PlainTextByLineStream(new FileInputStream(testFile), utf8);
                                sampleStream = new TokenSampleStream(testingLineStream);
                                TokenizerEvaluator myNonSplitingEvaluator = new TokenizerEvaluator(
                                        myNonSplitingTokenizer);
                                myNonSplitingEvaluator.evaluate(sampleStream);
                                FMeasure myNonSplitFMeasure = myNonSplitingEvaluator.getFMeasure();
                                myNonSplitFStats.addValue(myNonSplitFMeasure.getFMeasure());
                                myNonSplitRecallStats.addValue(myNonSplitFMeasure.getRecallScore());
                                myNonSplitPrecisionStats.addValue(myNonSplitFMeasure.getPrecisionScore());
                                textTestResults
                                        .setText(textTestResults.getText() + " " + myNonSplitFMeasure.getFMeasure()
                                                + " " + myNonSplitFMeasure.getPrecisionScore() + " "
                                                + myNonSplitFMeasure.getRecallScore() + "\n");
                            }
                            textTestResults.setText(textTestResults.getText() + "\n");
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        } finally {
                            if (sampleStream != null) {
                                try {
                                    sampleStream.close();
                                } catch (IOException ex) {
                                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE,
                                            null, ex);
                                }
                            }
                        }
                    } catch (FileNotFoundException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (testingLineStream != null) {
                            try {
                                testingLineStream.close();
                            } catch (IOException ex) {
                                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null,
                                        ex);
                            }
                        }
                    }
                }
            }
            textTestResults.setText(textTestResults.getText() + "\n");
            textTestResults.setText(textTestResults.getText() + "test model\n");
            textTestResults.setText(textTestResults.getText() + "f score mean " + curFStats.getMean() + " stdDev "
                    + curFStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "recall mean " + curRecallStats.getMean()
                    + " stdDev " + curRecallStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "precision score mean "
                    + curPrecisionStats.getMean() + " stdDev " + curPrecisionStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "std model\n");
            textTestResults.setText(textTestResults.getText() + "f score mean " + stdFStats.getMean() + " stdDev "
                    + stdFStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "recall mean " + stdRecallStats.getMean()
                    + " stdDev " + stdRecallStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "precision score mean "
                    + stdPrecisionStats.getMean() + " stdDev " + stdPrecisionStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "my non spliting model\n");
            textTestResults.setText(textTestResults.getText() + "f score mean " + myNonSplitFStats.getMean()
                    + " stdDev " + myNonSplitFStats.getStandardDeviation() + "\n");
            textTestResults.setText(textTestResults.getText() + "recall mean " + myNonSplitRecallStats.getMean()
                    + " stdDev " + myNonSplitRecallStats.getStandardDeviation() + "\n");
            textTestResults.setText(
                    textTestResults.getText() + "precision score mean " + myNonSplitPrecisionStats.getMean()
                            + " stdDev " + myNonSplitPrecisionStats.getStandardDeviation() + "\n");
            //create combinded training file
            trainingFileWriter = null;
            try {
                trainingFileWriter = new java.io.BufferedWriter(
                        new java.io.OutputStreamWriter(new java.io.FileOutputStream(trainingFile), utf8));
                for (File curTrainingFile : mFileCollectionListModel) {
                    java.io.BufferedReader curTrainingFileReader = null;
                    try {
                        Charset fileCharset = FileUtils.determineCharset(curTrainingFile);
                        if (fileCharset == null) {
                            fileCharset = utf8;
                        }
                        curTrainingFileReader = new java.io.BufferedReader(new java.io.InputStreamReader(
                                new java.io.FileInputStream(curTrainingFile), fileCharset));
                        while (curTrainingFileReader.ready()) {
                            String curLine = curTrainingFileReader.readLine();
                            trainingFileWriter.append(curLine).append("\n");
                        }
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    } finally {
                        if (curTrainingFileReader != null) {
                            curTrainingFileReader.close();
                        }
                    }
                }
                trainingFileWriter.write('\n');
            } catch (IOException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (trainingFileWriter != null) {
                    try {
                        trainingFileWriter.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            //create and train model
            ObjectStream<String> lineStream = null;
            this.createdObject = null;
            try {
                lineStream = new PlainTextByLineStream(new FileInputStream(trainingFile), utf8);
                ObjectStream<TokenSample> sampleStream = null;
                try {
                    sampleStream = new TokenSampleStream(lineStream);
                    this.createdObject = TokenizerME.train(sampleStream, myTokenizerFactory,
                            TrainingParameters.defaultParams());
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (sampleStream != null) {
                        try {
                            sampleStream.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
            } catch (FileNotFoundException ex) {
                Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
            } finally {
                if (lineStream != null) {
                    try {
                        lineStream.close();
                    } catch (IOException ex) {
                        Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            }
            if (createdObject != null) {
                OutputStream modelOut = null;
                File modelFile = new File("en-fiction-token.bin");
                try {
                    modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
                    createdObject.serialize(modelOut);
                } catch (IOException ex) {
                    Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                } finally {
                    if (modelOut != null) {
                        try {
                            modelOut.close();
                        } catch (IOException ex) {
                            Logger.getLogger(SentenceDetectorTrainer.class.getName()).log(Level.SEVERE, null, ex);
                        }
                    }
                }
            }
            textTestResults.setText(textTestResults.getText() + "done");
        }).start();
    }//GEN-LAST:event_cmdTrainActionPerformed

    /**
     * @param args the command line arguments
     */
    public static void main(String args[]) {
        /* Set the Nimbus look and feel */
        //<editor-fold defaultstate="collapsed" desc=" Look and feel setting code (optional) ">
        /* If Nimbus (introduced in Java SE 6) is not available, stay with the default look and feel.
         * For details see http://download.oracle.com/javase/tutorial/uiswing/lookandfeel/plaf.html 
         */
        try {
            for (javax.swing.UIManager.LookAndFeelInfo info : javax.swing.UIManager.getInstalledLookAndFeels()) {
                if ("Nimbus".equals(info.getName())) {
                    javax.swing.UIManager.setLookAndFeel(info.getClassName());
                    break;
                }
            }
        } catch (ClassNotFoundException ex) {
            java.util.logging.Logger.getLogger(WordSplitingTokenizerTrainer.class.getName())
                    .log(java.util.logging.Level.SEVERE, null, ex);
        } catch (InstantiationException ex) {
            java.util.logging.Logger.getLogger(WordSplitingTokenizerTrainer.class.getName())
                    .log(java.util.logging.Level.SEVERE, null, ex);
        } catch (IllegalAccessException ex) {
            java.util.logging.Logger.getLogger(WordSplitingTokenizerTrainer.class.getName())
                    .log(java.util.logging.Level.SEVERE, null, ex);
        } catch (javax.swing.UnsupportedLookAndFeelException ex) {
            java.util.logging.Logger.getLogger(WordSplitingTokenizerTrainer.class.getName())
                    .log(java.util.logging.Level.SEVERE, null, ex);
        }
        //</editor-fold>
        //</editor-fold>

        /* Create and display the form */
        java.awt.EventQueue.invokeLater(new Runnable() {
            public void run() {
                new WordSplitingTokenizerTrainer().setVisible(true);
            }
        });
    }

    // Variables declaration - do not modify//GEN-BEGIN:variables
    private javax.swing.JButton cmdCreateAbbreviationDictionary;
    private javax.swing.JButton cmdCreateSpellingDictionary;
    private javax.swing.JButton cmdDeleteTrainingFile;
    private javax.swing.JButton cmdLoadAbbreviationDictionary;
    private javax.swing.JButton cmdLoadSpellingDictionary;
    private javax.swing.JButton cmdLoadTrainingFile;
    private javax.swing.JButton cmdTrain;
    private javax.swing.JComboBox<TimeComplexity> comboTimeComplexity;
    private javax.swing.JScrollPane jScrollPane1;
    private javax.swing.JScrollPane jScrollPane2;
    private javax.swing.JList<File> listFiles;
    private javax.swing.JFileChooser myFileChooser;
    private javax.swing.JTextArea textTestResults;
    // End of variables declaration//GEN-END:variables
}