ErrorCorrection.java Source code

Introduction

Here is the source code for ErrorCorrection.java
Source

/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.trees.Tree;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Command-line tool that tries to repair commonly confused English words
 * (to/too, its/it's, lose/loose, their/they're, your/you're) line by line.
 *
 * <p>For every non-empty input line a set of candidate rewrites is generated
 * by swapping confusable words, every candidate is parsed with the Stanford
 * lexicalized parser, and the candidate whose parse tree reports the highest
 * score is appended to the output file. Empty lines are copied through
 * unchanged.
 *
 * @author Majisha
 */
public class ErrorCorrection {

    private static final String PCG_MODEL = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";

    private static final TokenizerFactory<CoreLabel> tokenizerFactory = PTBTokenizer
            .factory(new CoreLabelTokenFactory(), "invertible=true");

    private static final LexicalizedParser parser = LexicalizedParser.loadModel(PCG_MODEL);

    /**
     * One direction of a confusable pair: a case-insensitive pattern for the
     * word to look for and the word that should replace a match.
     */
    private static final class Rule {

        private final Pattern pattern;
        private final String lower;
        private final String capitalized;

        /**
         * @param regex       pattern of the word to replace (compiled case-insensitively)
         * @param replacement lower-case replacement word; the capitalized form
         *                    is derived by upper-casing its first character
         */
        Rule(String regex, String replacement) {
            this.pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
            this.lower = replacement;
            this.capitalized = Character.toUpperCase(replacement.charAt(0)) + replacement.substring(1);
        }

        /**
         * Picks the replacement whose capitalization follows the first
         * character of the matched text.
         */
        String replacementFor(String matched) {
            return Character.isUpperCase(matched.charAt(0)) ? capitalized : lower;
        }
    }

    /**
     * Substitution rules, applied in this order. Each confusable pair appears
     * in both directions so that either spelling can be turned into the other.
     */
    private static final Rule[] RULES = {
        new Rule("\\bto\\b", "too"),
        new Rule("\\btoo\\b", "to"),
        new Rule("\\bits\\b", "it's"),
        new Rule("\\bit\\s*'s\\b", "its"),
        new Rule("\\blose\\b", "loose"),
        new Rule("\\bloose\\b", "lose"),
        new Rule("\\btheir\\b", "they're"),
        new Rule("\\bthey\\s*'re\\b", "their"),
        new Rule("\\byour\\b", "you're"),
        new Rule("\\byou\\s*'re\\b", "your"),
    };

    /** Tokenizes {@code str} and returns the parse tree produced by the parser. */
    private static Tree parse(String str) {
        List<CoreLabel> tokens = tokenize(str);
        return parser.apply(tokens);
    }

    /** Splits {@code str} into tokens using the shared tokenizer factory. */
    private static List<CoreLabel> tokenize(String str) {
        Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(new StringReader(str));
        return tokenizer.tokenize();
    }

    /**
     * Builds the candidate rewrites of {@code line}.
     *
     * <p>The rules are applied one after another. For each rule, every
     * candidate collected so far contributes one new candidate per match, in
     * which only that single match is replaced. Candidates created by one rule
     * are visible to the following rules, so replacements of different rules
     * can combine.
     *
     * @param line the original input line
     * @return the candidates in insertion order; the first element is always
     *         {@code line} itself
     */
    private static Set<String> generateCandidates(String line) {
        // Insertion-ordered so the untouched line is always scored first.
        Set<String> candidates = new LinkedHashSet<>();
        candidates.add(line);

        for (Rule rule : RULES) {
            // Collected separately: the set must not be modified while iterating it.
            List<String> additions = new ArrayList<>();
            for (String candidate : candidates) {
                Matcher m = rule.pattern.matcher(candidate);
                while (m.find()) {
                    additions.add(candidate.substring(0, m.start())
                            + rule.replacementFor(m.group())
                            + candidate.substring(m.end()));
                }
            }
            candidates.addAll(additions);
        }
        return candidates;
    }

    /**
     * Parses every candidate and returns the one whose tree has the highest
     * score. A later candidate only wins with a strictly higher score, so on a
     * tie the earliest candidate in iteration order is kept.
     *
     * @param candidates non-empty collection of candidate sentences
     * @return the best-scoring candidate
     */
    private static String pickBest(Set<String> candidates) {
        String bestLine = null;
        double best = 0.0;
        for (String candidate : candidates) {
            double score = parse(candidate).score();
            if (bestLine == null || best < score) {
                best = score;
                bestLine = candidate;
            }
        }
        return bestLine;
    }

    /**
     * Reads the input file, corrects each line and appends the result to the
     * output file.
     *
     * @param args the command line arguments: {@code args[0]} is the input
     *             file, {@code args[1]} is the output file (opened in append mode)
     * @throws IOException              if reading the input or writing the output fails
     * @throws IllegalArgumentException if fewer than two arguments are given
     */
    public static void main(String[] args) throws IOException {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: ErrorCorrection <input-file> <output-file>");
        }

        // The input is consumed completely before any output is written.
        List<String> lines = new ArrayList<>();
        try (BufferedReader in = new BufferedReader(new FileReader(new File(args[0])))) {
            String str;
            while ((str = in.readLine()) != null) {
                lines.add(str);
            }
        }

        try (PrintWriter out = new PrintWriter(new FileWriter(args[1], true))) {
            for (String line : lines) {
                if (line.isEmpty()) {
                    out.println(line);
                } else {
                    out.println(pickBest(generateCandidates(line)));
                }
            }
            // PrintWriter never throws on write errors; surface them explicitly.
            if (out.checkError()) {
                throw new IOException("Failed to write to " + args[1]);
            }
        }
    }

}