org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.WikipediaMarkupDeletingDecorator.java Source code

Introduction

Here is the source code for org.dice_research.topicmodeling.preprocessing.docsupplier.decorator.WikipediaMarkupDeletingDecorator.java
Source

/**
 * This file is part of topicmodeling.preprocessing.
 *
 * topicmodeling.preprocessing is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * topicmodeling.preprocessing is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with topicmodeling.preprocessing.  If not, see <http://www.gnu.org/licenses/>.
 */
package org.dice_research.topicmodeling.preprocessing.docsupplier.decorator;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringEscapeUtils;
import org.dice_research.topicmodeling.preprocessing.docsupplier.DocumentSupplier;
import org.dice_research.topicmodeling.utils.doc.Document;
import org.dice_research.topicmodeling.utils.doc.DocumentText;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Decorator that rewrites the {@link DocumentText} of a document by removing wiki markup
 * (templates, links, comments, tags, headlines, text formatting, lists, horizontal rules and
 * tables) and by unescaping HTML entities afterwards.
 *
 * <p>
 * The cleaning is purely regex based and works in a fixed sequence of passes; the order of the
 * passes matters because later passes rely on markup removed by earlier ones.
 * </p>
 */
@Deprecated
public class WikipediaMarkupDeletingDecorator extends AbstractDocumentSupplierDecorator {

    private static final Logger LOGGER = LoggerFactory.getLogger(WikipediaMarkupDeletingDecorator.class);

    /** Redirect keywords (English and German) that may start a page; compared ignoring case. */
    private static final String[] REDIRECT_KEYWORDS = { "#redirect", "#weiterleitung" };

    // Delimited regions whose content is deleted. Group 1 is the opening marker, group 2 the
    // closing marker.
    private static final Pattern TEMPLATE_MARKERS = Pattern.compile("(\\{\\{)|(\\}\\})");
    private static final Pattern COMMENT_MARKERS = Pattern.compile("(<!--)|(-->)");

    // Markup whose surrounding content is retained.
    private static final Pattern INTERNAL_LINK_TARGET = Pattern.compile("\\[\\[[^\\]]*\\|");
    private static final Pattern DOUBLE_BRACKET_OPEN = Pattern.compile("\\[\\[");
    private static final Pattern DOUBLE_BRACKET_CLOSE = Pattern.compile("\\]\\]");
    private static final Pattern EXTERNAL_LINK_URL = Pattern.compile("\\[[^\\]]* ");
    private static final Pattern BRACKET_OPEN = Pattern.compile("\\[");
    private static final Pattern BRACKET_CLOSE = Pattern.compile("\\]");
    private static final Pattern OPENING_TAG = Pattern.compile("<[^>]*>");
    private static final Pattern CLOSING_TAG = Pattern.compile("</[^>]*>");
    private static final Pattern HEADLINE_MARKER = Pattern.compile("([=]{2,6})");
    private static final Pattern FORMATTING_MARKER = Pattern.compile("([']{2,5})");
    private static final Pattern LIST_PREFIX = Pattern.compile("\\n[*#;:]+");
    private static final Pattern HORIZONTAL_RULE = Pattern.compile("----");

    // Table markup; every pattern is anchored on the preceding line break.
    private static final Pattern TABLE_START = Pattern.compile("\\n\\{\\|[^\\n]*\\n");
    private static final Pattern TABLE_HEADER_ATTRIBUTES = Pattern.compile("\\n\\![^\\|\\n]*\\|");
    private static final Pattern TABLE_HEADER = Pattern.compile("\\n\\![^\\|\\n]*");
    private static final Pattern TABLE_END = Pattern.compile("\\n\\|\\}");
    private static final Pattern TABLE_ROW = Pattern.compile("\\n\\|-[^\\n]*\\n");
    private static final Pattern TABLE_CELL = Pattern.compile("\\n\\|");

    private static final Pattern HTML_ENTITY = Pattern.compile("(&[#\\p{Alnum}][\\p{Alnum}]*;)");

    public WikipediaMarkupDeletingDecorator(DocumentSupplier documentSource) {
        super(documentSource);
    }

    /**
     * Replaces the text of the document's {@link DocumentText} property with a version from which
     * the wiki markup has been removed.
     *
     * @param document
     *            the document to clean
     * @return the same document instance with cleaned text, or {@code null} if the document has no
     *         {@link DocumentText} property (an error is logged in that case)
     */
    @Override
    public Document prepareDocument(Document document) {
        DocumentText text = document.getProperty(DocumentText.class);
        if (text == null) {
            LOGGER.error("Got a Document without a DocumentText property!");
            return null;
        }
        text.setText(removeWikiMarkup(text.getText()));
        return document;
    }

    /**
     * Runs all cleaning passes over the given wiki text.
     *
     * @param text
     *            the raw wiki text
     * @return the text without the markup handled by this class
     */
    private static String removeWikiMarkup(String text) {
        String cleanText = stripRedirectKeyword(text);

        // macros / templates: {{ ... }} including their content
        cleanText = deleteDelimited(TEMPLATE_MARKERS, cleanText, "");
        // internal links: drop "[[target|" and "]]", keep the label
        cleanText = replaceAll(INTERNAL_LINK_TARGET, cleanText, "");
        cleanText = replaceAll(DOUBLE_BRACKET_CLOSE, cleanText, "");
        cleanText = replaceAll(DOUBLE_BRACKET_OPEN, cleanText, "");
        // removing "[[" can join two single "]" into a new "]]", hence the second pass
        cleanText = replaceAll(DOUBLE_BRACKET_CLOSE, cleanText, "");
        // external links: drop "[url " and "]", keep the label
        cleanText = replaceAll(EXTERNAL_LINK_URL, cleanText, "");
        cleanText = replaceAll(BRACKET_CLOSE, cleanText, "");
        cleanText = replaceAll(BRACKET_OPEN, cleanText, "");
        cleanText = replaceAll(BRACKET_CLOSE, cleanText, "");
        // comments including their content, replaced by a single space
        cleanText = deleteDelimited(COMMENT_MARKERS, cleanText, " ");

        // all types of tags; the tags are replaced by a space, their content is kept
        cleanText = replaceAll(OPENING_TAG, cleanText, " ");
        cleanText = replaceAll(CLOSING_TAG, cleanText, " ");
        // headlines; the same pattern marks both ends, so a single pass removes both
        cleanText = replaceAll(HEADLINE_MARKER, cleanText, "");
        // formattings (italic / bold); again one pattern for both ends
        cleanText = replaceAll(FORMATTING_MARKER, cleanText, "");

        // all types of lists (bullet list, numbered list, definition list, indent text)
        cleanText = replaceAll(LIST_PREFIX, cleanText, "\n");
        // horizontal rule
        cleanText = replaceAll(HORIZONTAL_RULE, cleanText, "");

        // tables starting with "{| text to be deleted"
        cleanText = replaceAll(TABLE_START, cleanText, "\n");
        // "! text to be deleted | text to be retained \n"
        cleanText = replaceAll(TABLE_HEADER_ATTRIBUTES, cleanText, "\n");
        // "! text to be deleted \n"
        cleanText = replaceAll(TABLE_HEADER, cleanText, "\n");
        // end of table
        cleanText = replaceAll(TABLE_END, cleanText, "\n");
        // new table line
        cleanText = replaceAll(TABLE_ROW, cleanText, "\n");
        // new table cell
        cleanText = replaceAll(TABLE_CELL, cleanText, "\n");

        // unescape symbols which are written with HTML escaping
        return unescapeSymbols(cleanText);
    }

    /**
     * Removes a leading redirect keyword, if there is one.
     *
     * <p>
     * The comparison is done with {@link String#regionMatches(boolean, int, String, int, int)}
     * instead of lower-casing the text, so that the result does not depend on the default locale
     * of the JVM.
     * </p>
     *
     * @param text
     *            the raw wiki text
     * @return the text without the leading keyword, or the unchanged text if it does not start
     *         with one of the known keywords
     */
    private static String stripRedirectKeyword(final String text) {
        for (String keyword : REDIRECT_KEYWORDS) {
            if (text.regionMatches(true, 0, keyword, 0, keyword.length())) {
                return text.substring(keyword.length());
            }
        }
        return text;
    }

    /**
     * Deletes every (possibly nested) region enclosed by an opening and a closing marker,
     * including the markers themselves, and puts the replacement in its place.
     *
     * <p>
     * Unbalanced markers are left in the text as they are: a closing marker without a preceding
     * opening marker is ignored, and an outermost region that is never closed is copied verbatim.
     * Neither case loses or duplicates text, and well-formed regions elsewhere in the text are
     * still removed.
     * </p>
     *
     * @param markers
     *            pattern of the form {@code (open)|(close)}; group 1 has to be the opening and
     *            group 2 the closing marker
     * @param text
     *            the text to clean
     * @param replacement
     *            the literal string inserted for every deleted outermost region
     * @return the text without the delimited regions
     */
    private static String deleteDelimited(final Pattern markers, final String text, final String replacement) {
        Matcher marker = markers.matcher(text);
        StringBuilder kept = new StringBuilder(text.length());

        int depth = 0;
        // start of the text that has not been copied yet
        int copyFrom = 0;
        // start of the outermost region that is currently open
        int regionStart = 0;

        while (marker.find()) {
            if (marker.group(1) != null) {
                if (depth == 0) {
                    kept.append(text, copyFrom, marker.start());
                    regionStart = marker.start();
                }
                depth++;
            } else if (depth > 0) {
                depth--;
                if (depth == 0) {
                    kept.append(replacement);
                    copyFrom = marker.end();
                }
            }
            // a closing marker at depth 0 has no partner; it stays part of the copied text
        }
        // an unterminated region is kept from its opening marker on
        kept.append(text, (depth > 0) ? regionStart : copyFrom, text.length());
        return kept.toString();
    }

    /**
     * Replaces every match of the pattern in the text.
     *
     * @param pattern
     *            the markup to replace
     * @param text
     *            the text to clean
     * @param replacement
     *            the replacement string, interpreted as in {@link Matcher#replaceAll(String)}
     * @return the text with all matches replaced
     */
    private static String replaceAll(final Pattern pattern, final String text, final String replacement) {
        return pattern.matcher(text).replaceAll(replacement);
    }

    /**
     * Unescapes HTML entities (e.g. {@code &amp;} or {@code &#228;}) found in the text. Only
     * the substrings that look like an entity are handed to
     * {@link StringEscapeUtils#unescapeHtml4(String)}; everything else is copied unchanged.
     *
     * @param text
     *            the text that may contain HTML entities
     * @return the text with the entities replaced by the result of the unescaping
     */
    private static String unescapeSymbols(final String text) {
        Matcher entity = HTML_ENTITY.matcher(text);
        StringBuilder cleanText = new StringBuilder(text.length());

        int textPos = 0;
        while (entity.find()) {
            cleanText.append(text, textPos, entity.start());
            cleanText.append(StringEscapeUtils.unescapeHtml4(entity.group()));
            textPos = entity.end();
        }
        cleanText.append(text, textPos, text.length());
        return cleanText.toString();
    }
}