List of usage examples for org.apache.lucene.benchmark.byTask.feeds.DocData.getTitle()
public String getTitle()
From source file:com.datastax.dse.demos.solr.Wikipedia.java
License:Open Source License
public static boolean addDoc(SolrInputDocument doc, DocData d) { if (d.getTitle().indexOf(":") > 0) return false; doc.clear();//from w w w . j a va 2 s . c o m doc.addField("id", d.getName()); doc.addField("title", d.getTitle()); doc.addField("body", d.getBody()); doc.addField("date", d.getDate()); return true; }
From source file:com.grantingersoll.intell.index.Indexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();/*from ww w .j a v a 2 s. c om*/ //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", docData.getName()); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("docnum_i", String.valueOf(i)); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaIndexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();/*from w ww . j av a 2 s .c om*/ //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(i)); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaWexIndexer.java
License:Apache License
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaWEX != null && wikipediaWEX.isFile()) { WexWikiContentSource contentSource = new WexWikiContentSource(); Properties properties = new Properties(); // fileName = config.get("docs.file", null); String filePath = wikipediaWEX.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from ww w . j ava 2s .c o m // docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(docData.getID())); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); String[] categories = docData.getProps().getProperty("category").split(";;"); for (String c : categories) { sDoc.addField("category", c); } if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaWEX); } return result; }
From source file:io.anserini.index.transform.NekoStringTransform.java
License:Apache License
/**
 * Parses {@code s} with {@code dhp} and returns the parsed title and body joined by a newline.
 *
 * @param s raw text handed to the parser
 * @return {@code title + "\n" + body}, or the empty string if anything goes wrong while parsing
 */
@Override
public String apply(String s) {
    try {
        DocData parsed = dhp.parse(new DocData(), "", null, new StringReader(s), null);
        String title = parsed.getTitle();
        String body = parsed.getBody();
        return title + "\n" + body;
    } catch (Exception e) {
        // Best effort: any failure yields an empty result instead of propagating.
        return "";
    }
}