Usage examples for org.apache.lucene.benchmark.byTask.feeds.TrecContentSource.getHtmlParser()
HTMLParser getHtmlParser()
From source file:info.boytsov.lucene.parsers.TrecGov2Parser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { // skip some of the non-html text, optionally set date Date date = null;//from ww w .jav a2s . c om int start = 0; final int h1 = docBuf.indexOf(DOCHDR); if (h1 >= 0) { final int hStart2dLine = h1 + DOCHDR.length() + 1; final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine); if (hEnd2dLine >= 0) { String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim(); if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); final String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } start = h2 + TERMINATING_DOCHDR.length(); final String html = docBuf.substring(start); docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); return docData; } else { System.err.println("Ignoring schema in URI: " + url); } } else { System.err.println("Invalid header: " + docBuf.toString()); } } /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ return docData; }
From source file:parsers.TrecGov2Parser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { // skip some of the non-html text, optionally set date Date date = null;/*from ww w . j av a2 s . co m*/ int start = 0; final int h1 = docBuf.indexOf(DOCHDR); if (h1 >= 0) { final int hStart2dLine = h1 + DOCHDR.length() + 1; final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine); if (hEnd2dLine >= 0) { String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim(); if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); final String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } start = h2 + TERMINATING_DOCHDR.length(); final String html = docBuf.substring(start); docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); docData.setName(name); return docData; } else { System.err.println("Ignoring schema in URI: " + url); } } else { throw new RuntimeException("Invalid header: " + docBuf.toString()); } } /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ return docData; }