Usage examples for the org.apache.lucene.benchmark.byTask.feeds.DocData constructor (DocData)
From source file:com.datastax.dse.demos.solr.Wikipedia.java
License:Open Source License
public static void indexWikipedia() { HttpSolrServer solrClient = null;//from w w w . ja v a 2 s . com try { Properties p = new Properties(); p.setProperty("keep.image.only.docs", "false"); p.setProperty("docs.file", wikifile); Config config = new Config(p); source = new EnwikiContentSource(); source.setConfig(config); source.resetInputs(); solrClient = new HttpSolrServer(url); if (null != user && null != password) { AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient(); httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password)); } DocData docData = new DocData(); String firstName = null; SolrInputDocument doc = new SolrInputDocument(); int i = 0; for (int x = 0; x < limit; x++) { if (i > 0 && i % 1000 == 0) System.out.println("Indexed " + i++); docData = source.getNextDocData(docData); if (firstName == null) firstName = docData.getName(); else if (firstName.equals(docData.getName())) break; //looped if (addDoc(doc, docData)) { solrClient.add(doc); i++; } } } catch (NoMoreDataException e) { } catch (Exception e) { e.printStackTrace(); } finally { try { if (solrClient != null) solrClient.commit(); source.close(); } catch (Throwable t) { } } }
From source file:com.grantingersoll.intell.index.Indexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from w ww .j av a2 s . c o m //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", docData.getName()); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("docnum_i", String.valueOf(i)); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaIndexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from ww w . j a v a 2s . c o m //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(i)); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaWexIndexer.java
License:Apache License
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaWEX != null && wikipediaWEX.isFile()) { WexWikiContentSource contentSource = new WexWikiContentSource(); Properties properties = new Properties(); // fileName = config.get("docs.file", null); String filePath = wikipediaWEX.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from w w w . j av a2 s. c o m // docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(docData.getID())); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); String[] categories = docData.getProps().getProperty("category").split(";;"); for (String c : categories) { sDoc.addField("category", c); } if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaWEX); } return result; }
From source file:io.anserini.index.transform.NekoStringTransform.java
License:Apache License
/**
 * Parses the given raw document string with the Neko-based HTML parser
 * ({@code dhp}) and returns the extracted title and body joined by a
 * newline. Any parse failure is treated as best-effort: the empty string
 * is returned instead of propagating the exception.
 */
@Override
public String apply(String s) {
    try {
        DocData parsed = dhp.parse(new DocData(), "", null, new StringReader(s), null);
        StringBuilder out = new StringBuilder();
        out.append(parsed.getTitle()).append('\n').append(parsed.getBody());
        return out.toString();
    } catch (Exception ignored) {
        // Malformed input: swallow and yield an empty transform result.
        return "";
    }
}
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;//from w ww .j av a 2 s . c o m try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }