List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData setName
public void setName(String name)
From source file:com.grantingersoll.intell.index.EnwikiContentSource.java
License:Apache License
@Override public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { String[] tuple = parser.next(); docData.clear();//w ww.j a v a2 s. co m docData.setName(tuple[ID]); docData.setBody(tuple[BODY]); docData.setDate(tuple[DATE]); docData.setTitle(tuple[TITLE]); return docData; }
From source file:info.boytsov.lucene.parsers.DemoHTMLParser.java
License:Apache License
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc) throws IOException, SAXException { String bodyText = ""; String title = ""; try {/*from ww w .j a va 2 s. c om*/ Parser p = new Parser(source); // properties final Properties docProps = p.metaTags; String dateStr = docProps.getProperty("date"); if (dateStr != null) { final Date newDate = trecSrc.parseDate(dateStr); if (newDate != null) { date = newDate; } } for (Entry<Object, Object> entry : docProps.entrySet()) { bodyText = bodyText + " " + entry.getKey() + " " + entry.getValue(); } title = p.title; bodyText = title + " " + bodyText + " " + p.body; } catch (Exception e) { System.err.println("Parsing error: " + e.getMessage()); } docData.clear(); docData.setName(name); docData.setTitle(title); docData.setBody(bodyText); docData.setProps(new Properties()); docData.setDate(date); return docData; }
From source file:info.boytsov.lucene.parsers.EnwikiContentSource.java
License:Apache License
/**
 * Fills {@code docData} with the next Wikipedia article from the parser.
 *
 * <p>The body is the title followed by the article text, and the title is
 * stored a second time under the {@code url} property.
 *
 * @param docData reusable holder; cleared and repopulated on every call
 * @return the same {@code docData} instance, populated
 * @throws NoMoreDataException if the parser is exhausted
 * @throws IOException on read failure
 */
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String[] fields = parser.next();
    docData.clear();
    docData.setName(fields[ID]);
    docData.setBody(fields[TITLE] + " " + fields[BODY]);
    docData.setDate(fields[DATE]);
    docData.setTitle(fields[TITLE]);
    // TODO: @leo The title is not a real URL; a real one may be needed some day.
    // For sorting purposes this is fine: if the input is unsorted and we want
    // sorted document ids, this value suffices.
    final Properties props = new Properties();
    props.put("url", fields[TITLE]);
    docData.setProps(props);
    return docData;
}
From source file:info.boytsov.lucene.parsers.LeoHTMLParser.java
License:Open Source License
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc) throws IOException { String title = ""; String bodyText = ""; String baseHref = "http://fake-domain.com"; String encoding = "utf8"; /*//from www . j a va 2s .c o m * * This is clearly not the most efficient way to parse, * but it is much more stable. * */ StringWriter writer = new StringWriter(); BufferedReader br = new BufferedReader(source.getCharacterStream()); String line; while (null != (line = br.readLine())) { writer.append(line); } br.close(); String html = writer.toString(); try { Parser HtmlParser = Parser.createParser(html, encoding); LeoCleanerUtil res = new LeoCleanerUtil(baseHref); HtmlParser.visitAllNodesWith(res); title = res.GetTitleText(); bodyText = title + " " + res.GetDescriptionText() + " " + res.GetKeywordText() + " " + res.GetBodyText(); } catch (ParserException e) { System.err.println(" Parser exception: " + e + " trying simple conversion"); // Plan B!!! Pair<String, String> sres = LeoCleanerUtil.SimpleProc(html); title = sres.getFirst(); bodyText = title + " " + sres.getSecond(); } docData.clear(); docData.setName(name); docData.setTitle(title); docData.setBody(bodyText); docData.setProps(new Properties()); docData.setDate(date); return docData; }
From source file:it.unipd.dei.ims.lucene.clef.parser.ClefDocParser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { int mark = 0; // that much is skipped docData.clear();/* w ww . j a va 2 s . co m*/ docData.setName(name); docData.setBody(stripTags(docBuf, mark).toString()); return docData; }
From source file:parsers.ClueWebContentSource.java
License:Open Source License
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { WarcRecord CurrRec = null;// w ww.j ava 2 s . c o m // protect reading from the TREC files by multiple threads. The rest of the // method, i.e., parsing the content and returning the DocData can run unprotected. synchronized (lock) { if (reader == null) { openNextFile(); } do { CurrRec = WarcRecord.readNextWarcRecord(reader); /* * We need to skip special auxiliary entries, e.g., in the * beginning of the file. */ } while (CurrRec != null && !CurrRec.getHeaderRecordType().equals("response")); if (CurrRec == null) { openNextFile(); return getNextDocData(docData); } } Date date = parseDate(CurrRec.getHeaderMetadataItem("WARC-Date")); String url = CurrRec.getHeaderMetadataItem("WARC-Target-URI"); String trecId = CurrRec.getHeaderMetadataItem("WARC-TREC-ID"); if (null == trecId) throw new RuntimeException("No WARC-TREC-ID field for url: '" + url + "'"); // This code segment relies on HtmlParser being thread safe. When we get // here, everything else is already private to that thread, so we're safe. 
if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { // In ClueWeb09, the HTTP response was incorrectly terminated by \n\n instead of \r\n\r\n // as requested by the standard // So, to make all ClueWeb12 documents parseable with the old approach, we replace the first // \r\n\r\n with \n\n and will proceed as if we have ClueWeb09 String Response = CurrRec.getContentUTF8().replaceFirst("\r\n\r\n", "\n\n"); int EndOfHead = Response.indexOf("\n\n"); if (EndOfHead >= 0) { String html = Response.substring(EndOfHead + 2); //System.out.println(html); //System.out.println("===================="); docData = htmlParser.parse(docData, url, date, new StringReader(html), this); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); docData.setName(trecId); } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Cannot extract HTML in URI: " + url); } } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Ignoring schema in URI: " + url); } addItem(); return docData; }
From source file:parsers.TrecGov2Parser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { // skip some of the non-html text, optionally set date Date date = null;// w w w .j a v a 2 s. c o m int start = 0; final int h1 = docBuf.indexOf(DOCHDR); if (h1 >= 0) { final int hStart2dLine = h1 + DOCHDR.length() + 1; final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine); if (hEnd2dLine >= 0) { String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim(); if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); final String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } start = h2 + TERMINATING_DOCHDR.length(); final String html = docBuf.substring(start); docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); docData.setName(name); return docData; } else { System.err.println("Ignoring schema in URI: " + url); } } else { throw new RuntimeException("Invalid header: " + docBuf.toString()); } } /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ return docData; }