de.tudarmstadt.ukp.csniper.resbuild.EvaluationItemFixer.java Source code

Java tutorial

Introduction

Here is the source code for de.tudarmstadt.ukp.csniper.resbuild.EvaluationItemFixer.java

Source

/*******************************************************************************
 * Copyright 2013
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 ******************************************************************************/
package de.tudarmstadt.ukp.csniper.resbuild;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.csniper.webapp.evaluation.model.Corpus;
import de.tudarmstadt.ukp.csniper.webapp.search.CorpusService;
import de.tudarmstadt.ukp.csniper.webapp.search.SearchEngine;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.ContextUnit;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpEngine;
import de.tudarmstadt.ukp.csniper.webapp.search.cqp.CqpQuery;

/**
 * One-off maintenance tool that cleans up the {@code coveredText} column of the
 * {@code EvaluationItem} table.
 * <p>
 * An item is considered dubious if its covered text contains a double space, contains one of the
 * bracket placeholders {@code -LRB-} / {@code -RRB-}, or starts/ends with a space. For items with
 * a double space the text is re-fetched through CQP; afterwards the placeholders are replaced by
 * real brackets and the text is trimmed. The cleaned texts are written to a log file and then
 * stored back in the database; items whose text could not be re-fetched are written to a second
 * log file and left untouched.
 * <p>
 * NOTE(review): database credentials and all file system paths are hard-coded below and have to
 * be adapted to the local environment before running the tool.
 */
public class EvaluationItemFixer {
    private static final Log log = LogFactory.getLog(EvaluationItemFixer.class);

    private static final String HOST = "jdbc:mysql://loewe-ncc.ukp.informatik.tu-darmstadt.de/";
    private static final String DATABASE = "csniper";
    private static final String USER = "csniper";
    private static final String PASSWORD = "csniper";

    // private static final String CQP_EXECUTABLE = "/opt/imscwb/cqp";
    // private static final String CORPUS_REPOSITORY = "/srv/csniper";

    // private static final String HOST = "jdbc:mysql://127.0.0.1/";
    // private static final String DATABASE = "csniper";
    // private static final String USER = "root";
    // private static final String PASSWORD = "gugaguga";

    private static final String CQP_EXECUTABLE = "D:\\ukp\\cwb-3.4.3\\bin\\cqp.exe";
    private static final String REPOSITORY = "D:\\ukp\\data\\csniper";

    private static final String LOG_SUCCESSFUL = "C:\\users\\dodinh\\desktop\\successful.txt";
    private static final String LOG_FAILED = "C:\\users\\dodinh\\desktop\\failed.txt";

    /** Placeholder for an opening round bracket; replaced by "(" during cleaning. */
    private static final String LRB = "-LRB-";
    /** Placeholder for a closing round bracket; replaced by ")" during cleaning. */
    private static final String RRB = "-RRB-";

    /** Database connection shared by all steps; opened by {@link #connect} and closed by main. */
    private static Connection connection;

    /**
     * CQP engine configured with a stub corpus service. The stub only supplies the repository
     * path; all other methods return {@code null}.
     */
    private static CqpEngine engine = new CqpEngine() {
        private static final long serialVersionUID = 1L;
        {
            setCorpusService(new CorpusService() {
                @Override
                public List<SearchEngine> listEngines(String aCorpusId) {
                    return null;
                }

                @Override
                public List<String> listCorpora() {
                    return null;
                }

                @Override
                public File getRepositoryPath() {
                    return new File(REPOSITORY);
                }

                @Override
                public Corpus getCorpus(String aCorpusId) {
                    return null;
                }
            });
            setCqpExecutable(new File(CQP_EXECUTABLE));
        }
    };

    /**
     * Runs the clean-up: select dubious items, write the log files, update the database.
     *
     * @param args
     *            ignored.
     */
    public static void main(String[] args) {
        connect(HOST, DATABASE, USER, PASSWORD);

        try {
            Map<Integer, String> items = new HashMap<Integer, String>();
            Map<Integer, String> failed = new HashMap<Integer, String>();

            // fetch coveredTexts of dubious items and clean it
            collectItems(items, failed);

            // write logs
            writeLogs(failed, items);

            log.info("Texts for [" + items.size() + "] items need to be cleaned up.");

            // update the dubious items with the cleaned coveredText
            updateItems(items);
        } finally {
            // release the connection even if one of the steps fails unexpectedly
            closeQuietly(connection);
        }
    }

    /**
     * Selects all dubious items and computes their cleaned covered text.
     *
     * @param aItems
     *            receives item id -> cleaned covered text for every item that could be cleaned.
     * @param aFailed
     *            receives item id -> error message for every item whose text could not be
     *            re-fetched.
     */
    private static void collectItems(Map<Integer, String> aItems, Map<Integer, String> aFailed) {
        StringBuilder selectQuery = new StringBuilder();
        selectQuery.append("SELECT * FROM EvaluationItem ");
        // LOCATE takes the substring first and the string to search second
        selectQuery.append("WHERE LOCATE('  ', coveredText) > 0 ");
        selectQuery.append("OR LOCATE('" + LRB + "', coveredText) > 0 ");
        selectQuery.append("OR LOCATE('" + RRB + "', coveredText) > 0 ");
        selectQuery.append("OR LEFT(coveredText, 1) = ' ' ");
        selectQuery.append("OR RIGHT(coveredText, 1) = ' ' ");

        PreparedStatement select = null;
        ResultSet rs = null;
        try {
            select = connection.prepareStatement(selectQuery.toString());
            log.info("Running query [" + selectQuery.toString() + "].");
            rs = select.executeQuery();

            while (rs.next()) {
                int id = rs.getInt("id");
                String coveredText = rs.getString("coveredText");

                try {
                    // special handling of double whitespace: in this case, re-fetch the text
                    if (coveredText.contains("  ")) {
                        coveredText = retrieveCoveredText(rs.getString("collectionId"),
                                rs.getString("documentId"), rs.getInt("beginOffset"),
                                rs.getInt("endOffset"));
                    }

                    aItems.put(id, cleanCoveredText(coveredText));
                } catch (IllegalArgumentException e) {
                    // thrown by retrieveCoveredText when CQP finds nothing for the item
                    aFailed.put(id, e.getMessage());
                }
            }
        } catch (SQLException e) {
            log.error("Exception while selecting: " + e.getMessage(), e);
        } finally {
            closeQuietly(rs);
            closeQuietly(select);
        }
    }

    /**
     * Replaces the bracket placeholders by real brackets and trims the text.
     */
    private static String cleanCoveredText(String aCoveredText) {
        String cleaned = StringUtils.replace(aCoveredText, LRB, "(");
        cleaned = StringUtils.replace(cleaned, RRB, ")");
        return cleaned.trim();
    }

    /**
     * Writes the failed items to {@link #LOG_FAILED} and the cleaned items to
     * {@link #LOG_SUCCESSFUL}, one {@code id - text} line per item, encoded as UTF-8.
     */
    private static void writeLogs(Map<Integer, String> aFailed, Map<Integer, String> aItems) {
        BufferedWriter bwf = null;
        BufferedWriter bws = null;
        try {
            bwf = new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(new File(LOG_FAILED)), "UTF-8"));
            writeEntries(bwf, aFailed);

            bws = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(new File(LOG_SUCCESSFUL)), "UTF-8"));
            writeEntries(bws, aItems);
        } catch (IOException e) {
            log.error("Got an IOException while writing the log files.", e);
        } finally {
            IOUtils.closeQuietly(bwf);
            IOUtils.closeQuietly(bws);
        }
    }

    /**
     * Writes every map entry as an {@code id - value} line to the given writer.
     */
    private static void writeEntries(BufferedWriter aWriter, Map<Integer, String> aEntries)
            throws IOException {
        for (Entry<Integer, String> e : aEntries.entrySet()) {
            aWriter.write(e.getKey() + " - " + e.getValue() + "\n");
        }
    }

    /**
     * Stores the cleaned covered texts in the database and logs the progress.
     *
     * @param aItems
     *            item id -> cleaned covered text.
     */
    private static void updateItems(Map<Integer, String> aItems) {
        PreparedStatement update = null;
        try {
            String updateQuery = "UPDATE EvaluationItem SET coveredText = ? WHERE id = ?";
            update = connection.prepareStatement(updateQuery);

            int total = aItems.size();
            // report roughly every percent, but at least after every item for small batches
            int step = Math.max(1, (int) Math.ceil((double) total / 100));
            int done = 0;
            for (Entry<Integer, String> e : aItems.entrySet()) {
                int id = e.getKey();
                String coveredText = e.getValue();

                // update item in database
                update.setString(1, coveredText);
                update.setInt(2, id);
                update.executeUpdate();
                log.debug("Updating " + id + " with [" + coveredText + "]");

                // show percentage of updated items
                done++;
                if (done % step == 0 || done == total) {
                    int percent = (int) ((long) done * 100 / total);
                    log.info(percent + "% finished (" + done + "/" + total + ").");
                }
            }
        } catch (SQLException e) {
            log.error("Exception while updating: " + e.getMessage(), e);
        } finally {
            closeQuietly(update);
        }
    }

    /**
     * Loads the MySQL driver and opens the shared {@link #connection}.
     *
     * @param aHost
     *            JDBC URL prefix up to and including the trailing slash.
     * @param aDatabase
     *            database name appended to the host.
     * @param aUser
     *            database user.
     * @param aPassword
     *            database password.
     * @throws RuntimeException
     *             if the driver cannot be loaded or the connection cannot be established.
     */
    private static void connect(String aHost, String aDatabase, String aUser, String aPassword) {
        // pass the credentials separately instead of embedding them unescaped in the URL
        String url = aHost + aDatabase;

        try {
            Class.forName("com.mysql.jdbc.Driver");
            connection = DriverManager.getConnection(url, aUser, aPassword);
        } catch (ClassNotFoundException e) {
            throw new RuntimeException("Failed to load the specified database driver.", e);
        } catch (SQLException e) {
            throw new RuntimeException("There was an unrecoverable error while connecting to the database.", e);
        }
    }

    /**
     * Closes a {@link Connection}, {@link Statement} or {@link ResultSet}; any failure is logged
     * instead of being propagated. {@code null} and objects of other types are ignored.
     * <p>
     * Using a workaround for java < 1.7.
     */
    private static void closeQuietly(Object aAutoCloseable) {
        try {
            if (aAutoCloseable != null) {
                if (aAutoCloseable instanceof Connection) {
                    ((Connection) aAutoCloseable).close();
                }
                if (aAutoCloseable instanceof Statement) {
                    ((Statement) aAutoCloseable).close();
                }
                if (aAutoCloseable instanceof ResultSet) {
                    ((ResultSet) aAutoCloseable).close();
                }
            }
        } catch (Exception e) {
            log.error("There was an unrecoverable error while closing [" + aAutoCloseable + "].", e);
        }
    }

    /**
     * Re-fetches the covered text of an item by running a CQP query on its offsets.
     *
     * @param aCollectionId
     *            collection the query is run against.
     * @param aDocumentId
     *            value the match's {@code text_id} has to equal.
     * @param aBeginOffset
     *            value of the {@code begin} attribute of the first token.
     * @param aEndOffset
     *            value of the {@code end} attribute of the last token.
     * @return the covered text of the first match.
     * @throws IllegalArgumentException
     *             if the query yields no result; the message is the query string.
     */
    private static String retrieveCoveredText(final String aCollectionId, String aDocumentId, int aBeginOffset,
            int aEndOffset) {
        CqpQuery query = new CqpQuery(engine, "", aCollectionId);
        try {
            query.setContext(0, 0, ContextUnit.CHARACTER);
            String queryString = "[begin=\"" + aBeginOffset + "\"] []* [end=\"" + aEndOffset
                    + "\"] :: match.text_id=\"" + aDocumentId + "\"";
            log.trace(queryString);

            query.runQuery(queryString);

            if (query.size() > 1) {
                log.warn("More than 1 entry found: " + query);
            }
            if (query.size() == 0) {
                log.error("Nothing found for: " + queryString);
                throw new IllegalArgumentException(queryString);
            }
            return query.cat(1).get(0).getCoveredText();
        } finally {
            // always release the query, also when running it or reading the result fails
            query.close();
        }
    }
}