de.l3s.content.timex.extracting.utils.DateUtil.java Source code

Java tutorial

Introduction

Here is the source code for de.l3s.content.timex.extracting.utils.DateUtil.java

Source

package de.l3s.content.timex.extracting.utils;

/*
 * TIMETool - Large-scale Temporal Search in MapReduce
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you
 * may not use this file except in compliance with the License. You may
 * obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied. See the License for the specific language governing
 * permissions and limitations under the License.
 */

/*
 * THIS SOFTWARE IS PROVIDED BY THE LEMUR PROJECT AS PART OF THE CLUEWEB09
 * PROJECT AND OTHER CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN
 * NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * @author 
 */
import java.text.DateFormat;
import java.text.ParseException;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.joda.time.IllegalFieldValueException;
import org.joda.time.LocalDate;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

import edu.stanford.nlp.util.Pair;

public class DateUtil {
    // Parsers for dates found in page content. The locale is pinned to US because the
    // regular expressions below only match English weekday and month names; with the JVM
    // default locale, parsing would fail on any non-English (or non-US-ordered) installation.
    // NOTE: java.text.DateFormat instances are not thread-safe; these shared instances must
    // not be used from several threads without external synchronisation.
    // Long form, e.g. "Monday, January 5, 2009".
    private static DateFormat full_df = DateFormat.getDateInstance(DateFormat.FULL, Locale.US);
    // Short form, e.g. "Jan 5, 2009".
    private static DateFormat medium_df = DateFormat.getDateInstance(DateFormat.MEDIUM, Locale.US);
    // Compact numeric date (year, month, day) used for URL- and docid-derived dates.
    private final static DateTimeFormatter dateFormat = DateTimeFormat.forPattern("yyyyMMdd");
    // Abbreviated weekday followed by an ISO date, e.g. "Mon, 2009-01-05". Not referenced in this class.
    static final String blog_date4 = "(Mon|Tue|Wed|Thu|Fri|Sat|Sun), \\d{4}-\\d{2}-\\d{2}";
    // Month name (full or abbreviated), day, optional comma, four-digit year, e.g. "Jan 5, 2009".
    static final String blog_date1 = "(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\s+[0-3]?[0-9],?\\s+[0-2][0-9][0-9][0-9]";
    // Day/month/year separated by slashes, years 1900-2099. Not referenced in this class.
    static final String blog_date2 = "(0?[1-9]|[12][0-9]|3[01])/(0?[1-9]|1[012])/((19|20)\\d\\d)";
    // Whole-string year-month-day separated by dashes, years 1900-2099. Not referenced in this class.
    static final String blog_date3 = "^((19|20)\\d\\d)-(0?[1-9]|1[012])-(0?[1-9]|[12][0-9]|3[01])$";
    // Full weekday name followed by the blog_date1 shape, e.g. "Monday, January 5, 2009".
    static final String blog_date0 = "(Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday),?\\s+(January|February|March|April|May|June|July|August|September|October|November|December|Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\s+[0-3]?[0-9],?\\s+[0-2][0-9][0-9][0-9]";
    // Non-greedy match of an <html>...</html> span on a single line ('.' does not match line breaks here).
    static final String html_pattern = "<html>.*?</html>";
    // Loose matcher for http:// or www. URLs. Instance field; not referenced in this class.
    String url_regex = "\\(?\\b(http://|www[.])[-A-Za-z0-9+&@#/%?=~_()|!:,.;]*[-A-Za-z0-9+&@#/%=~_()|]";
    // Compiled form of html_pattern. Not referenced in this class.
    static Pattern html_tag = Pattern.compile(html_pattern);

    /**
     * Creates a date extractor. The constructor body is empty; the formatters and
     * patterns used by the extraction methods are static members of this class.
     */
    public DateUtil() {
    }

    /** Precompiled {@code blog_date0}: weekday-prefixed dates such as "Monday, January 5, 2009". */
    private static final Pattern WEEKDAY_DATE_PATTERN = Pattern.compile(blog_date0);

    /** Precompiled {@code blog_date1}: month-day-year dates such as "Jan 5, 2009". */
    private static final Pattern MONTH_DATE_PATTERN = Pattern.compile(blog_date1);

    /**
     * Extracts the first date mentioned in a piece of text.
     *
     * <p>The text is searched for a weekday-prefixed date ({@code blog_date0}); if one is
     * found it is parsed with {@code full_df}. Only when no weekday-prefixed date occurs
     * is the text searched for a plain month-day-year date ({@code blog_date1}), which is
     * parsed with {@code medium_df}. Each match is first parsed exactly as written and, if
     * that fails, parsed again after its commas and whitespace have been rewritten to the
     * shape "Weekday, Month d, yyyy" or "Month d, yyyy".
     *
     * @param dateString text that may contain a date; may be {@code null}
     * @return the parsed date, or {@code null} if the text is {@code null}, contains no
     *         recognised date, or the matched date could not be parsed (in which case the
     *         failure is printed to standard output)
     */
    public LocalDate extractDateFromContent(String dateString) {
        if (dateString == null) {
            return null;
        }

        Matcher weekdayDate = WEEKDAY_DATE_PATTERN.matcher(dateString);
        if (weekdayDate.find()) {
            String raw = weekdayDate.group();
            // Drop every comma, then put one back after the weekday and one before the year.
            String spaced = raw.replaceAll(",?\\s+", " ");
            String canonical = insertCommaBeforeYear(spaced.replaceFirst(" ", ", "));
            return parseWithFallback(full_df, raw, canonical);
        }

        Matcher monthDate = MONTH_DATE_PATTERN.matcher(dateString);
        if (monthDate.find()) {
            String raw = monthDate.group();
            String canonical = insertCommaBeforeYear(raw.replaceAll(",?\\s+", " "));
            return parseWithFallback(medium_df, raw, canonical);
        }

        return null;
    }

    /**
     * Inserts ", " in front of the four-digit year that ends the given date text.
     * Works for any year, not only 2000-2009.
     */
    private static String insertCommaBeforeYear(String dateText) {
        return dateText.replaceFirst(" (\\d{4})$", ", $1");
    }

    /**
     * Parses {@code raw} with the given format and, if that fails, parses {@code canonical}
     * instead. Returns {@code null} (after printing the failure) when both attempts fail.
     */
    private static LocalDate parseWithFallback(DateFormat format, String raw, String canonical) {
        try {
            return new LocalDate(format.parseObject(raw));
        } catch (ParseException ignored) {
            // The text as written did not fit the format; retry with the canonical punctuation.
            try {
                return new LocalDate(format.parseObject(canonical));
            } catch (ParseException canonicalFailure) {
                System.out.println(canonical + "\n " + canonicalFailure.getMessage());
                return null;
            }
        }
    }

    /**
     * Determines the date of a document from, in order of preference, its content,
     * its URL and its document id.
     *
     * <p>The first two content lines are scanned with
     * {@link #extractDateFromContent(String)}; the first date found there is returned.
     * Otherwise a date is derived from the URL and from characters 7-14 (zero-based) of
     * the document id, which are parsed as {@code yyyyMMdd}. URL dates with day 15 are
     * treated as month-precision dates: if such a date names the same year and month as
     * the document id, the (more precise) document-id date is returned. In every other
     * case the URL date wins, and the document-id date is used only when the URL yields
     * nothing.
     *
     * @param content_lines lines of the document body; may be {@code null} or empty, in
     *        which case the content step is skipped
     * @param url the document URL; {@code null} is treated as "no date in URL"
     * @param docId the document id; must be at least 15 characters long with a
     *        {@code yyyyMMdd} date at positions 7-14 whenever the content yields no date
     * @return the extracted date
     * @throws ParseException declared for source compatibility with existing callers;
     *         no code path visible in this class throws it
     * @throws StringIndexOutOfBoundsException if {@code docId} is shorter than 15 characters
     * @throws IllegalArgumentException if the date portion of {@code docId} is not a valid
     *         {@code yyyyMMdd} date
     */
    public LocalDate extractDate(String[] content_lines, String url, String docId) throws ParseException {
        // Only the first two lines of the content are inspected.
        if (content_lines != null) {
            for (int i = 0; i < content_lines.length && i < 2; i++) {
                LocalDate contentDate = extractDateFromContent(content_lines[i]);
                if (contentDate != null) {
                    return contentDate;
                }
            }
        }

        LocalDate urlDate = (url == null) ? null : extractDateFromURL(url);
        LocalDate docIdDate = LocalDate.parse(docId.substring(7, 15), dateFormat);
        if (urlDate == null) {
            return docIdDate;
        }

        // Day 15 marks a URL date that only carried year and month.
        boolean monthPrecisionOnly = urlDate.getDayOfMonth() == 15;
        boolean sameMonth = urlDate.getYear() == docIdDate.getYear()
                && urlDate.getMonthOfYear() == docIdDate.getMonthOfYear();
        if (monthPrecisionOnly && sameMonth) {
            return docIdDate;
        }
        return urlDate;
    }

    /**
     * Day of month assigned to dates for which the URL only reveals year and month.
     * {@code extractDate} recognises month-precision URL dates by this value.
     */
    private static final String MONTH_ONLY_DAY = "15";

    /** Three-letter prefixes of the English month names, January first. */
    private static final String[] MONTH_PREFIXES = { "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug",
            "Sep", "Oct", "Nov", "Dec" };

    /**
     * URL date layouts in the order in which they are tried; the first layout found in
     * the URL decides the result. Underscores in the URL are turned into slashes before
     * matching, so layouts such as {@code /2006_10_05_} are covered by the slash-based
     * rules and need no patterns of their own.
     */
    private static final UrlDateRule[] URL_DATE_RULES = {
            // /January/2006/
            new UrlDateRule(
                    "/(January|February|March|April|May|June|July|August|September|October|November|December)/(19|20)\\d{2}/",
                    false, "strong"),
            // /Jan/2006/
            new UrlDateRule("/(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/(19|20)\\d{2}/", false,
                    "mildly strong"),
            // /01/2006/
            new UrlDateRule("/(01|02|03|04|05|06|07|08|09|10|11|12)/(19|20)\\d{2}/", false, "mildly strong"),
            // /1/2006/
            new UrlDateRule("/(1|2|3|4|5|6|7|8|9|10|11|12)/(19|20)\\d{2}/", false, "mildly strong"),
            // /2006/01/31/
            new UrlDateRule("/(19|20)\\d{2}/(01|02|03|04|05|06|07|08|09|10|11|12)/[0-9][0-9]/", true,
                    "very strong"),
            // /2006/1/
            new UrlDateRule("/(19|20)\\d{2}/(1|2|3|4|5|6|7|8|9|10|11|12)/", true, "mildly strong"),
            // /2006/01/
            new UrlDateRule("/(19|20)\\d{2}/(01|02|03|04|05|06|07|08|09|10|11|12)/", true, "mildly strong") };

    /** One recognisable date layout inside a URL path. */
    private static final class UrlDateRule {
        /** Precompiled layout; every match starts with '/' and uses '/' as separator. */
        final Pattern pattern;
        /** {@code true} if the year precedes the month in the layout. */
        final boolean yearFirst;
        /** Confidence label reported by {@code extractDateFromURL_}. */
        final String strength;

        UrlDateRule(String regex, boolean yearFirst, String strength) {
            this.pattern = Pattern.compile(regex);
            this.yearFirst = yearFirst;
            this.strength = strength;
        }
    }

    /** A date found in a URL together with the confidence label of the rule that matched. */
    private static final class UrlDateHit {
        final LocalDate date;
        final String strength;

        UrlDateHit(LocalDate date, String strength) {
            this.date = date;
            this.strength = strength;
        }
    }

    /**
     * Extracts a date from a URL path.
     *
     * <p>When the URL only carries year and month, the returned date uses day 15.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return the date found in the URL, or {@code null} if the URL is {@code null},
     *         contains no recognised date layout, or the first layout found does not
     *         denote a valid calendar date
     */
    public static LocalDate extractDateFromURL(String url) {
        UrlDateHit hit = findDateInUrl(url);
        return hit == null ? null : hit.date;
    }

    /**
     * Extracts a date from a URL path together with a confidence label.
     *
     * <p>The first element of the pair is the date in ISO form ({@code yyyy-MM-dd}, day 15
     * when the URL only carries year and month). The second element is {@code "very strong"}
     * for year/month/day layouts, {@code "strong"} for a full month name followed by a year,
     * and {@code "mildly strong"} for every other layout.
     *
     * @param url the URL to inspect; may be {@code null}
     * @return the date and its confidence label, or {@code null} under the same conditions
     *         as {@link #extractDateFromURL(String)}
     */
    public static Pair<String, String> extractDateFromURL_(String url) {
        UrlDateHit hit = findDateInUrl(url);
        return hit == null ? null : Pair.makePair(hit.date.toString(), hit.strength);
    }

    /**
     * Applies the URL date rules in order and parses the first layout that occurs in the URL.
     * Returns {@code null} if nothing matches or the matched text is not a valid date.
     */
    private static UrlDateHit findDateInUrl(String url) {
        if (url == null) {
            return null;
        }
        String path = url.replace('_', '/');
        for (UrlDateRule rule : URL_DATE_RULES) {
            Matcher found = rule.pattern.matcher(path);
            if (!found.find()) {
                continue;
            }
            // Strip the leading '/', then split into year / month (/ day) parts.
            String[] token = found.group().substring(1).split("/");
            String year = rule.yearFirst ? token[0] : token[1];
            String month = toTwoDigitMonth(rule.yearFirst ? token[1] : token[0]);
            String day = token.length > 2 ? token[2] : MONTH_ONLY_DAY;
            try {
                return new UrlDateHit(LocalDate.parse(year + month + day, dateFormat), rule.strength);
            } catch (IllegalFieldValueException invalidDate) {
                // e.g. day "00" or "45": the first matching layout decides, so give up here.
                return null;
            }
        }
        return null;
    }

    /**
     * Converts a month token taken from a URL (English name, three-letter abbreviation,
     * or one- or two-digit number) to its two-digit number.
     */
    private static String toTwoDigitMonth(String month) {
        for (int i = 0; i < MONTH_PREFIXES.length; i++) {
            if (month.startsWith(MONTH_PREFIXES[i])) {
                return (i < 9 ? "0" : "") + (i + 1);
            }
        }
        // Numeric month: pad so that the yyyyMMdd formatter always sees eight digits.
        return month.length() == 1 ? "0" + month : month;
    }

    /**
     * Replaces the last match of a regular expression in a string.
     *
     * <p>The replacement is applied with {@link Matcher#appendReplacement}, so
     * {@code $n} group references and backslash escapes in it are interpreted.
     *
     * @param input the text to search
     * @param regex the regular expression to look for
     * @param replacement the replacement for the last match
     * @return the text with its last match replaced, or {@code input} itself when the
     *         expression does not match at all
     */
    public static String replaceLast(String input, String regex, String replacement) {
        Matcher scan = Pattern.compile(regex).matcher(input);

        // Walk over all matches, remembering where the final one begins.
        int lastStart = -1;
        while (scan.find()) {
            lastStart = scan.start();
        }
        if (lastStart < 0) {
            return input;
        }

        // Re-position the matcher on the final match and splice in the replacement.
        scan.find(lastStart);
        StringBuffer result = new StringBuffer(input.length());
        scan.appendReplacement(result, replacement);
        scan.appendTail(result);
        return result.toString();
    }

    /**
     * Small manual check: prints the date extracted from a sample blog URL.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        LocalDate sample = extractDateFromURL("http://0009.org/blog/index.php/2006/10/");
        System.out.println(sample);
    }
}

/**
 * Plain holder for the parts of one blog document record. All fields are public and
 * mutable; the class performs no parsing or validation itself.
 *
 * <p>The string constants are the literal markers that delimit the corresponding parts
 * of a record (presumably the markup of the blog collection being read; confirm against
 * the reader that uses them). Constants ending in an underscore are the closing markers.
 */
class BlogDocument {
    // Record delimiters.
    public static final String DOC = "<DOC>";
    public static final String DOC_ = "</DOC>";

    // Document number.
    public static final String DOCNO = "<DOCNO>";
    public static final String DOCNO_ = "</DOCNO>";

    // Date section.
    public static final String DATE_XML = "<DATE_XML>";
    public static final String DATE_XML_ = "</DATE_XML>";

    // Permalink section.
    public static final String PERMALINK = "<PERMALINK>";
    public static final String PERMALINK_ = "</PERMALINK>";

    // Document header section.
    public static final String DOCHDR = "<DOCHDR>";
    public static final String DOCHDR_ = "</DOCHDR>";

    // Opening markers only; no closing counterparts are defined.
    public static final String DOCTEXT = "<html>";
    public static final String TITLE = "<title>";

    /** Document number. */
    public String docno;
    /** Permalink of the document. */
    public String permalink;
    /** Date section of the record. */
    public String date_xml;
    /** Header section of the record. */
    public String dochdr;
    /** Document body; not set by any constructor. */
    public String content;

    /** Creates an empty document; every field is {@code null}. */
    public BlogDocument() {
    }

    /**
     * Creates a document from its metadata parts; {@code content} is left {@code null}.
     *
     * @param docno document number
     * @param permalink permalink of the document
     * @param date_xml date section of the record
     * @param dochdr header section of the record
     */
    public BlogDocument(String docno, String permalink, String date_xml, String dochdr) {
        this.dochdr = dochdr;
        this.date_xml = date_xml;
        this.permalink = permalink;
        this.docno = docno;
    }

}