List of usage examples for the org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource constructor, EnwikiContentSource()
EnwikiContentSource
From source file: com.datastax.dse.demos.solr.Wikipedia.java
License:Open Source License
public static void indexWikipedia() { HttpSolrServer solrClient = null;/*from ww w. ja v a2 s .c o m*/ try { Properties p = new Properties(); p.setProperty("keep.image.only.docs", "false"); p.setProperty("docs.file", wikifile); Config config = new Config(p); source = new EnwikiContentSource(); source.setConfig(config); source.resetInputs(); solrClient = new HttpSolrServer(url); if (null != user && null != password) { AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient(); httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password)); } DocData docData = new DocData(); String firstName = null; SolrInputDocument doc = new SolrInputDocument(); int i = 0; for (int x = 0; x < limit; x++) { if (i > 0 && i % 1000 == 0) System.out.println("Indexed " + i++); docData = source.getNextDocData(docData); if (firstName == null) firstName = docData.getName(); else if (firstName.equals(docData.getName())) break; //looped if (addDoc(doc, docData)) { solrClient.add(doc); i++; } } } catch (NoMoreDataException e) { } catch (Exception e) { e.printStackTrace(); } finally { try { if (solrClient != null) solrClient.commit(); source.close(); } catch (Throwable t) { } } }
From source file: com.tamingtext.qa.WikipediaIndexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from ww w .j av a 2 s .c o m //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(i)); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }