List of usage examples for org.apache.lucene.benchmark.byTask.feeds DocData setProps
public void setProps(Properties props)
From source file:com.tamingtext.qa.WexWikiContentSource.java
License:Apache License
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { if (ir == null) { ir = getReader(file);//from w w w . j ava 2 s. c o m } String[] tuple = parser.next(); if (tuple == null) return null; docData.clear(); docData.setID(Integer.parseInt(tuple[ID])); docData.setTitle(tuple[TITLE]); docData.setBody(tuple[BODY]); docData.setDate(tuple[DATE]); props.setProperty("category", tuple[CATEGORY]); docData.setProps(props); return docData; }
From source file:info.boytsov.lucene.parsers.DemoHTMLParser.java
License:Apache License
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc) throws IOException, SAXException { String bodyText = ""; String title = ""; try {//www . j ava2 s. com Parser p = new Parser(source); // properties final Properties docProps = p.metaTags; String dateStr = docProps.getProperty("date"); if (dateStr != null) { final Date newDate = trecSrc.parseDate(dateStr); if (newDate != null) { date = newDate; } } for (Entry<Object, Object> entry : docProps.entrySet()) { bodyText = bodyText + " " + entry.getKey() + " " + entry.getValue(); } title = p.title; bodyText = title + " " + bodyText + " " + p.body; } catch (Exception e) { System.err.println("Parsing error: " + e.getMessage()); } docData.clear(); docData.setName(name); docData.setTitle(title); docData.setBody(bodyText); docData.setProps(new Properties()); docData.setDate(date); return docData; }
From source file:info.boytsov.lucene.parsers.EnwikiContentSource.java
License:Apache License
@Override public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { String[] tuple = parser.next(); docData.clear();//from w w w .j a v a 2 s . c om docData.setName(tuple[ID]); docData.setBody(tuple[TITLE] + " " + tuple[BODY]); docData.setDate(tuple[DATE]); docData.setTitle(tuple[TITLE]); /* * TODO: @leo This is not a real URL, maybe we will need a real URL some day. * This should be fine for sorting purposes, though. If the input * is unsorted and we want to produce sorted document ids, * this is just fine. */ Properties props = new Properties(); props.put("url", tuple[TITLE]); docData.setProps(props); return docData; }
From source file:info.boytsov.lucene.parsers.LeoHTMLParser.java
License:Open Source License
public DocData parse(DocData docData, String name, Date date, InputSource source, ContentSourceDateUtil trecSrc) throws IOException { String title = ""; String bodyText = ""; String baseHref = "http://fake-domain.com"; String encoding = "utf8"; /*//from w ww .j a va 2s.c o m * * This is clearly not the most efficient way to parse, * but it is much more stable. * */ StringWriter writer = new StringWriter(); BufferedReader br = new BufferedReader(source.getCharacterStream()); String line; while (null != (line = br.readLine())) { writer.append(line); } br.close(); String html = writer.toString(); try { Parser HtmlParser = Parser.createParser(html, encoding); LeoCleanerUtil res = new LeoCleanerUtil(baseHref); HtmlParser.visitAllNodesWith(res); title = res.GetTitleText(); bodyText = title + " " + res.GetDescriptionText() + " " + res.GetKeywordText() + " " + res.GetBodyText(); } catch (ParserException e) { System.err.println(" Parser exception: " + e + " trying simple conversion"); // Plan B!!! Pair<String, String> sres = LeoCleanerUtil.SimpleProc(html); title = sres.getFirst(); bodyText = title + " " + sres.getSecond(); } docData.clear(); docData.setName(name); docData.setTitle(title); docData.setBody(bodyText); docData.setProps(new Properties()); docData.setDate(date); return docData; }