Example usage for org.apache.lucene.benchmark.byTask.feeds DocData setDate

List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData setDate

Introduction

On this page you can find usage examples for the setDate method of org.apache.lucene.benchmark.byTask.feeds.DocData.

Prototype

public void setDate(String date) 

Source Link

Usage

From source file:com.grantingersoll.intell.index.EnwikiContentSource.java

License:Apache License

/**
 * Populates the supplied {@link DocData} from the next tuple handed out by
 * {@code parser}.
 *
 * <p>The instance is cleared first; its name, body, date and title are then
 * taken from the tuple slots {@code ID}, {@code BODY}, {@code DATE} and
 * {@code TITLE} respectively.
 *
 * @param docData the instance to reset and fill
 * @return the same {@code docData} instance that was passed in
 * @throws NoMoreDataException declared by the signature; nothing in this body throws it explicitly
 * @throws IOException declared by the signature; nothing in this body throws it explicitly
 */
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String[] fields = parser.next();

    // Reset before filling so no values from a previous use remain.
    docData.clear();

    docData.setName(fields[ID]);
    docData.setBody(fields[BODY]);
    docData.setDate(fields[DATE]);
    docData.setTitle(fields[TITLE]);

    return docData;
}

From source file:com.tamingtext.qa.WexWikiContentSource.java

License:Apache License

/**
 * Populates the supplied {@link DocData} from the next tuple handed out by
 * {@code parser}.
 *
 * <p>On the first call (while {@code ir} is still {@code null}) the reader is
 * obtained via {@code getReader(file)}. The document is cleared, then its id,
 * title, body and date are copied from the tuple, and the tuple's category is
 * stored under the {@code "category"} property.
 *
 * @param docData the instance to reset and fill
 * @return {@code docData}, or {@code null} when the parser returns no tuple
 * @throws NoMoreDataException declared by the signature; nothing in this body throws it explicitly
 * @throws IOException declared by the signature; nothing in this body throws it explicitly
 */
@Override
public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    // Lazily obtain the reader the first time a document is requested.
    if (null == ir) {
        ir = getReader(file);
    }

    final String[] fields = parser.next();
    if (fields == null) {
        return null;
    }

    docData.clear();
    // The id slot must hold a decimal integer; parseInt rejects anything else.
    docData.setID(Integer.parseInt(fields[ID]));
    docData.setTitle(fields[TITLE]);
    docData.setBody(fields[BODY]);
    docData.setDate(fields[DATE]);

    // NOTE(review): props is not declared in this method (presumably a field),
    // so the same Properties object is handed to docData on every call.
    final String category = fields[CATEGORY];
    props.setProperty("category", category);
    docData.setProps(props);

    return docData;
}

From source file:info.boytsov.lucene.parsers.DemoHTMLParser.java

License:Apache License

/**
 * Parses an HTML document and stores the extracted text in {@code docData}.
 *
 * <p>The resulting body is the title, followed by every meta-tag key/value
 * pair, followed by the parsed body, all separated by single spaces. When the
 * meta tags contain a {@code "date"} entry that {@code trecSrc} can parse, the
 * parsed value replaces the {@code date} argument.
 *
 * <p>Parsing is best effort: any exception raised while parsing is reported
 * on {@code System.err} and the document is still populated with whatever was
 * collected up to that point.
 *
 * @param docData the instance to reset and fill
 * @param name    value stored as the document name
 * @param date    fallback date, used unless a parsable "date" meta tag is found
 * @param source  the HTML input handed to {@code Parser}
 * @param trecSrc helper used to parse the "date" meta tag
 * @return the same {@code docData} instance that was passed in
 * @throws IOException  declared by the signature; exceptions raised while parsing are caught here
 * @throws SAXException declared by the signature; exceptions raised while parsing are caught here
 */
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc)
        throws IOException, SAXException {
    String bodyText = "";
    String title = "";
    // Meta-tag text is accumulated in a builder; repeated String concatenation
    // in the loop below would copy the growing text on every iteration.
    final StringBuilder metaText = new StringBuilder();
    try {
        Parser p = new Parser(source);

        // properties
        final Properties docProps = p.metaTags;
        String dateStr = docProps.getProperty("date");
        if (dateStr != null) {
            final Date newDate = trecSrc.parseDate(dateStr);
            if (newDate != null) {
                date = newDate;
            }
        }

        for (Entry<Object, Object> entry : docProps.entrySet()) {
            metaText.append(" ").append(entry.getKey()).append(" ").append(entry.getValue());
        }

        title = p.title;
        bodyText = title + " " + metaText + " " + p.body;
    } catch (Exception e) {
        // Best effort: keep any meta-tag text gathered before the failure.
        bodyText = metaText.toString();
        System.err.println("Parsing error: " + e.getMessage());
    }

    docData.clear();
    docData.setName(name);
    docData.setTitle(title);
    docData.setBody(bodyText);
    docData.setProps(new Properties());
    docData.setDate(date);
    return docData;
}

From source file:info.boytsov.lucene.parsers.EnwikiContentSource.java

License:Apache License

/**
 * Populates the supplied {@link DocData} from the next tuple handed out by
 * {@code parser}.
 *
 * <p>The body is the title and the tuple body joined by a single space, so
 * the title text is part of the body as well. The title is additionally
 * stored under the {@code "url"} property.
 *
 * @param docData the instance to reset and fill
 * @return the same {@code docData} instance that was passed in
 * @throws NoMoreDataException declared by the signature; nothing in this body throws it explicitly
 * @throws IOException declared by the signature; nothing in this body throws it explicitly
 */
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String[] fields = parser.next();

    docData.clear();
    docData.setName(fields[ID]);

    final String title = fields[TITLE];
    docData.setBody(title + " " + fields[BODY]);
    docData.setDate(fields[DATE]);
    docData.setTitle(title);

    /*
     * TODO: @leo The "url" property holds the title, not a real URL; a real
     *       one may be needed some day. For sorting purposes this should be
     *       fine: if the input is unsorted and sorted document ids are
     *       wanted, the title is good enough.
     */
    final Properties urlProps = new Properties();
    urlProps.setProperty("url", title);
    docData.setProps(urlProps);

    return docData;
}

From source file:info.boytsov.lucene.parsers.LeoHTMLParser.java

License:Open Source License

/**
 * Reads the whole HTML document from {@code source}, extracts its text and
 * stores the result in {@code docData}.
 *
 * <p>The document is first read fully into memory. It is then handed to the
 * HTML parser together with a {@code LeoCleanerUtil} visitor; the body stored
 * in {@code docData} is the title, description, keyword and body text
 * reported by that visitor, separated by single spaces. If the parser throws
 * a {@code ParserException}, the failure is reported on {@code System.err}
 * and {@code LeoCleanerUtil.SimpleProc} is used as a fallback.
 *
 * @param docData the instance to reset and fill
 * @param name    value stored as the document name
 * @param date    value stored as the document date
 * @param source  input whose character stream supplies the HTML
 * @param trecSrc not used by this implementation
 * @return the same {@code docData} instance that was passed in
 * @throws IOException if reading from the character stream fails
 */
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc)
        throws IOException {
    String title = "";
    String bodyText = "";

    String baseHref = "http://fake-domain.com";
    String encoding = "utf8";

    /*
     * This is clearly not the most efficient way to parse,
     * but it is much more stable.
     */
    final StringBuilder buffer = new StringBuilder();
    // try-with-resources closes the reader even when readLine() throws.
    try (BufferedReader br = new BufferedReader(source.getCharacterStream())) {
        String line;
        while (null != (line = br.readLine())) {
            // readLine() strips the line terminator; put one back so that
            // words and markup on adjacent lines are not fused together.
            buffer.append(line).append('\n');
        }
    }

    String html = buffer.toString();

    try {
        Parser htmlParser = Parser.createParser(html, encoding);

        LeoCleanerUtil res = new LeoCleanerUtil(baseHref);
        htmlParser.visitAllNodesWith(res);

        title = res.GetTitleText();

        bodyText = title + " " + res.GetDescriptionText() + " " + res.GetKeywordText() + " "
                + res.GetBodyText();

    } catch (ParserException e) {
        System.err.println(" Parser exception: " + e + " trying simple conversion");
        // Plan B: fall back to the simple conversion.
        Pair<String, String> sres = LeoCleanerUtil.SimpleProc(html);

        title = sres.getFirst();
        bodyText = title + " " + sres.getSecond();
    }

    docData.clear();
    docData.setName(name);
    docData.setTitle(title);
    docData.setBody(bodyText);
    docData.setProps(new Properties());
    docData.setDate(date);

    return docData;
}