Example usage for org.apache.lucene.benchmark.byTask.feeds TrecContentSource parseDate

List of usage examples for org.apache.lucene.benchmark.byTask.feeds TrecContentSource parseDate

Introduction

On this page you can find example usages of org.apache.lucene.benchmark.byTask.feeds.TrecContentSource.parseDate.

Prototype

public Date parseDate(String dateStr) 

Source Link

Usage

From source file:info.boytsov.lucene.parsers.TrecGov2Parser.java

License:Apache License

/**
 * Parses one document held in {@code docBuf}, using the URL line of its header to decide
 * whether the document is handed to the HTML parser.
 *
 * <p>The method locates {@code DOCHDR}, reads the text between the position one character
 * past that marker and the next newline as the URL (lower-cased, then trimmed), and only
 * continues when the URL starts with {@code http://}, {@code ftp://} or {@code https://}.
 * In that case everything after {@code TERMINATING_DOCHDR} is passed to
 * {@code trecSrc.getHtmlParser()}, and the URL is stored in the resulting document's
 * properties under the key {@code "url"}.
 *
 * <p>When {@code DOCHDR} is absent, the URL line has no terminating newline, or the URL
 * has a different scheme, {@code docData} is returned exactly as it was passed in (the
 * latter two cases also print a message to {@code System.err}).
 *
 * @param docData  document object that is passed to the HTML parser, or returned as-is
 * @param name     document name forwarded to the HTML parser
 * @param trecSrc  source that supplies the date parser and the HTML parser
 * @param docBuf   raw text of the document, including its header
 * @param pathType not read by this method
 * @return the object returned by the HTML parser, or {@code docData} unchanged when the
 *         document is not parsed
 * @throws IOException declared by the overridden method; presumably raised by the HTML
 *                     parser (not visible here)
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    final int h1 = docBuf.indexOf(DOCHDR);
    if (h1 >= 0) {
        // The URL is taken from one character past the marker (presumably skipping the
        // newline that follows it) up to the next newline.
        final int hStart2dLine = h1 + DOCHDR.length() + 1;
        final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine);

        if (hEnd2dLine >= 0) {
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale,
            // so the stored URL is the same on every machine.
            final String url = docBuf.substring(hStart2dLine, hEnd2dLine)
                    .toLowerCase(java.util.Locale.ROOT).trim();

            if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) {
                final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);

                // optionally set date
                final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
                final Date date = (dateStr != null) ? trecSrc.parseDate(dateStr) : null;

                // indexOf() yields -1 when the terminating marker is missing. Adding the
                // marker length to -1 would silently cut an arbitrary prefix off the
                // buffer, so in that case nothing is skipped and parsing starts at 0.
                final int start = (h2 >= 0) ? h2 + TERMINATING_DOCHDR.length() : 0;

                // skip the non-html header text
                final String html = docBuf.substring(start);
                docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
                // This should be done after parse(), b/c parse() resets properties
                docData.getProps().put("url", url);
                return docData;
            } else {
                System.err.println("Ignoring schema in URI: " + url);
            }
        } else {
            System.err.println("Invalid header: " + docBuf.toString());
        }
    }

    /*
     *  TODO: @leo What do we do here exactly?
     *  The interface doesn't allow us to signal that an entry should be skipped,
     *  so the incoming docData is handed back untouched.
     */

    return docData;
}

From source file:parsers.TrecGov2Parser.java

License:Apache License

/**
 * Parses one document held in {@code docBuf}, using the URL line of its header to decide
 * whether the document is handed to the HTML parser.
 *
 * <p>The method locates {@code DOCHDR}, reads the text between the position one character
 * past that marker and the next newline as the URL (lower-cased, then trimmed), and only
 * continues when the URL starts with {@code http://}, {@code ftp://} or {@code https://}.
 * In that case everything after {@code TERMINATING_DOCHDR} is passed to
 * {@code trecSrc.getHtmlParser()}; afterwards the URL is stored in the resulting
 * document's properties under the key {@code "url"} and its name is set to {@code name}.
 *
 * <p>When {@code DOCHDR} is absent or the URL has a different scheme, {@code docData} is
 * returned exactly as it was passed in (the scheme case also prints a message to
 * {@code System.err}).
 *
 * @param docData  document object that is passed to the HTML parser, or returned as-is
 * @param name     document name forwarded to the HTML parser and set on the result
 * @param trecSrc  source that supplies the date parser and the HTML parser
 * @param docBuf   raw text of the document, including its header
 * @param pathType not read by this method
 * @return the object returned by the HTML parser, or {@code docData} unchanged when the
 *         document is not parsed
 * @throws IOException      declared by the overridden method; presumably raised by the
 *                          HTML parser (not visible here)
 * @throws RuntimeException if {@code DOCHDR} is found but no newline terminates the URL
 *                          line that follows it
 */
@Override
public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf,
        ParsePathType pathType) throws IOException {
    final int h1 = docBuf.indexOf(DOCHDR);
    if (h1 >= 0) {
        // The URL is taken from one character past the marker (presumably skipping the
        // newline that follows it) up to the next newline.
        final int hStart2dLine = h1 + DOCHDR.length() + 1;
        final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine);

        if (hEnd2dLine >= 0) {
            // Locale.ROOT keeps the lower-casing independent of the JVM's default locale,
            // so the stored URL is the same on every machine.
            final String url = docBuf.substring(hStart2dLine, hEnd2dLine)
                    .toLowerCase(java.util.Locale.ROOT).trim();

            if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) {
                final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1);

                // optionally set date
                final String dateStr = extract(docBuf, DATE, DATE_END, h2, null);
                final Date date = (dateStr != null) ? trecSrc.parseDate(dateStr) : null;

                // indexOf() yields -1 when the terminating marker is missing. Adding the
                // marker length to -1 would silently cut an arbitrary prefix off the
                // buffer, so in that case nothing is skipped and parsing starts at 0.
                final int start = (h2 >= 0) ? h2 + TERMINATING_DOCHDR.length() : 0;

                // skip the non-html header text
                final String html = docBuf.substring(start);
                docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc);
                // This should be done after parse(), b/c parse() resets properties
                docData.getProps().put("url", url);
                docData.setName(name);
                return docData;
            } else {
                System.err.println("Ignoring schema in URI: " + url);
            }
        } else {
            throw new RuntimeException("Invalid header: " + docBuf.toString());
        }
    }

    /*
     *  TODO: @leo What do we do here exactly?
     *  The interface doesn't allow us to signal that an entry should be skipped,
     *  so the incoming docData is handed back untouched.
     */

    return docData;
}