List of usage examples for the method getProps() of org.apache.lucene.benchmark.byTask.feeds.DocData
public Properties getProps()
From source file:com.tamingtext.qa.WikipediaWexIndexer.java
License:Apache License
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaWEX != null && wikipediaWEX.isFile()) { WexWikiContentSource contentSource = new WexWikiContentSource(); Properties properties = new Properties(); // fileName = config.get("docs.file", null); String filePath = wikipediaWEX.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//w ww . j a v a2 s . com // docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(docData.getID())); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); String[] categories = docData.getProps().getProperty("category").split(";;"); for (String c : categories) { sDoc.addField("category", c); } if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaWEX); } return result; }
From source file:info.boytsov.lucene.parsers.ClueWeb09ContentSource.java
License:Open Source License
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { WarcRecord CurrRec = null;//from w w w . j a v a 2s . c om // protect reading from the TREC files by multiple threads. The rest of the // method, i.e., parsing the content and returning the DocData can run unprotected. synchronized (lock) { if (reader == null) { openNextFile(); } do { CurrRec = WarcRecord.readNextWarcRecord(reader); /* * We need to skip special auxiliary entries, e.g., in the * beginning of the file. */ } while (CurrRec != null && !CurrRec.getHeaderRecordType().equals("response")); if (CurrRec == null) { openNextFile(); return getNextDocData(docData); } } Date date = parseDate(CurrRec.getHeaderMetadataItem("WARC-Date")); String url = CurrRec.getHeaderMetadataItem("WARC-Target-URI"); // This code segment relies on HtmlParser being thread safe. When we get // here, everything else is already private to that thread, so we're safe. if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { String Response = CurrRec.getContentUTF8(); int EndOfHead = Response.indexOf("\n\n"); if (EndOfHead >= 0) { String html = Response.substring(EndOfHead + 2); Properties props = new Properties(); docData = htmlParser.parse(docData, url, date, new StringReader(html), this); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Cannot extract HTML in URI: " + url); } } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Ignoring schema in URI: " + url); } addItem(); return docData; }
From source file:info.boytsov.lucene.parsers.TrecGov2Parser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { // skip some of the non-html text, optionally set date Date date = null;/*from w w w . jav a 2s . c o m*/ int start = 0; final int h1 = docBuf.indexOf(DOCHDR); if (h1 >= 0) { final int hStart2dLine = h1 + DOCHDR.length() + 1; final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine); if (hEnd2dLine >= 0) { String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim(); if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); final String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } start = h2 + TERMINATING_DOCHDR.length(); final String html = docBuf.substring(start); docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); return docData; } else { System.err.println("Ignoring schema in URI: " + url); } } else { System.err.println("Invalid header: " + docBuf.toString()); } } /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ return docData; }
From source file:parsers.ClueWebContentSource.java
License:Open Source License
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { WarcRecord CurrRec = null;// ww w . j av a 2 s. co m // protect reading from the TREC files by multiple threads. The rest of the // method, i.e., parsing the content and returning the DocData can run unprotected. synchronized (lock) { if (reader == null) { openNextFile(); } do { CurrRec = WarcRecord.readNextWarcRecord(reader); /* * We need to skip special auxiliary entries, e.g., in the * beginning of the file. */ } while (CurrRec != null && !CurrRec.getHeaderRecordType().equals("response")); if (CurrRec == null) { openNextFile(); return getNextDocData(docData); } } Date date = parseDate(CurrRec.getHeaderMetadataItem("WARC-Date")); String url = CurrRec.getHeaderMetadataItem("WARC-Target-URI"); String trecId = CurrRec.getHeaderMetadataItem("WARC-TREC-ID"); if (null == trecId) throw new RuntimeException("No WARC-TREC-ID field for url: '" + url + "'"); // This code segment relies on HtmlParser being thread safe. When we get // here, everything else is already private to that thread, so we're safe. 
if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { // In ClueWeb09, the HTTP response was incorrectly terminated by \n\n instead of \r\n\r\n // as requested by the standard // So, to make all ClueWeb12 documents parseable with the old approach, we replace the first // \r\n\r\n with \n\n and will proceed as if we have ClueWeb09 String Response = CurrRec.getContentUTF8().replaceFirst("\r\n\r\n", "\n\n"); int EndOfHead = Response.indexOf("\n\n"); if (EndOfHead >= 0) { String html = Response.substring(EndOfHead + 2); //System.out.println(html); //System.out.println("===================="); docData = htmlParser.parse(docData, url, date, new StringReader(html), this); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); docData.setName(trecId); } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Cannot extract HTML in URI: " + url); } } else { /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ System.err.println("Ignoring schema in URI: " + url); } addItem(); return docData; }
From source file:parsers.TrecGov2Parser.java
License:Apache License
@Override public DocData parse(DocData docData, String name, TrecContentSource trecSrc, StringBuilder docBuf, ParsePathType pathType) throws IOException { // skip some of the non-html text, optionally set date Date date = null;//w ww . j a va2s . c om int start = 0; final int h1 = docBuf.indexOf(DOCHDR); if (h1 >= 0) { final int hStart2dLine = h1 + DOCHDR.length() + 1; final int hEnd2dLine = docBuf.indexOf("\n", hStart2dLine); if (hEnd2dLine >= 0) { String url = docBuf.substring(hStart2dLine, hEnd2dLine).toLowerCase().trim(); if (url.startsWith("http://") || url.startsWith("ftp://") || url.startsWith("https://")) { final int h2 = docBuf.indexOf(TERMINATING_DOCHDR, h1); final String dateStr = extract(docBuf, DATE, DATE_END, h2, null); if (dateStr != null) { date = trecSrc.parseDate(dateStr); } start = h2 + TERMINATING_DOCHDR.length(); final String html = docBuf.substring(start); docData = trecSrc.getHtmlParser().parse(docData, name, date, new StringReader(html), trecSrc); // This should be done after parse(), b/c parse() resets properties docData.getProps().put("url", url); docData.setName(name); return docData; } else { System.err.println("Ignoring schema in URI: " + url); } } else { throw new RuntimeException("Invalid header: " + docBuf.toString()); } } /* * TODO: @leo What do we do here exactly? * The interface doesn't allow us to signal that an entry should be skipped. */ return docData; }