List of usage examples for org.apache.lucene.benchmark.byTask.utils Config Config
public Config(Properties props)
From source file:com.datastax.dse.demos.solr.Wikipedia.java
License:Open Source License
public static void indexWikipedia() { HttpSolrServer solrClient = null;/* w ww .j a v a2 s . c o m*/ try { Properties p = new Properties(); p.setProperty("keep.image.only.docs", "false"); p.setProperty("docs.file", wikifile); Config config = new Config(p); source = new EnwikiContentSource(); source.setConfig(config); source.resetInputs(); solrClient = new HttpSolrServer(url); if (null != user && null != password) { AbstractHttpClient httpClient = (AbstractHttpClient) solrClient.getHttpClient(); httpClient.addRequestInterceptor(new PreEmptiveBasicAuthenticator(user, password)); } DocData docData = new DocData(); String firstName = null; SolrInputDocument doc = new SolrInputDocument(); int i = 0; for (int x = 0; x < limit; x++) { if (i > 0 && i % 1000 == 0) System.out.println("Indexed " + i++); docData = source.getNextDocData(docData); if (firstName == null) firstName = docData.getName(); else if (firstName.equals(docData.getName())) break; //looped if (addDoc(doc, docData)) { solrClient.add(doc); i++; } } } catch (NoMoreDataException e) { } catch (Exception e) { e.printStackTrace(); } finally { try { if (solrClient != null) solrClient.commit(); source.close(); } catch (Throwable t) { } } }
From source file:com.grantingersoll.intell.index.Indexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();/* w w w. j a va2 s . c o m*/ //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", docData.getName()); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("docnum_i", String.valueOf(i)); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaIndexer.java
License:Apache License
public int index(File wikipediaXML, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaXML != null && wikipediaXML.exists()) { EnwikiContentSource contentSource = new EnwikiContentSource(); Properties properties = new Properties(); //fileName = config.get("docs.file", null); String filePath = wikipediaXML.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();//from w w w.ja va 2 s .com //docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(i)); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaXML); } return result; }
From source file:com.tamingtext.qa.WikipediaWexIndexer.java
License:Apache License
public int index(File wikipediaWEX, int numDocs, int batchSize) throws Exception { int result = 0; if (wikipediaWEX != null && wikipediaWEX.isFile()) { WexWikiContentSource contentSource = new WexWikiContentSource(); Properties properties = new Properties(); // fileName = config.get("docs.file", null); String filePath = wikipediaWEX.getAbsolutePath(); properties.setProperty("docs.file", filePath); properties.setProperty("doc.maker.forever", "false"); contentSource.setConfig(new Config(properties)); contentSource.resetInputs();/* w ww .ja v a 2 s . c om*/ // docMaker.openFile(); List<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(1000); int i = 0; SolrInputDocument sDoc = null; long start = System.currentTimeMillis(); try { DocData docData = new DocData(); while ((docData = contentSource.getNextDocData(docData)) != null && i < numDocs) { int mod = i % batchSize; sDoc = new SolrInputDocument(); docs.add(sDoc); sDoc.addField("file", filePath + "_" + i); sDoc.addField("docid", String.valueOf(docData.getID())); sDoc.addField("body", docData.getBody()); sDoc.addField("doctitle", docData.getTitle()); sDoc.addField("name_s", docData.getName()); String[] categories = docData.getProps().getProperty("category").split(";;"); for (String c : categories) { sDoc.addField("category", c); } if (mod == batchSize - 1) { log.info("Sending: " + docs.size() + " docs" + " total sent for this file: " + i); server.add(docs); docs.clear(); } i++; } } catch (NoMoreDataException e) { } long finish = System.currentTimeMillis(); if (log.isInfoEnabled()) { log.info("Indexing took " + (finish - start) + " ms"); } if (docs.size() > 0) { server.add(docs); } result = i + docs.size(); server.commit(); server.optimize(); } else { System.out.println("Can't find file: " + wikipediaWEX); } return result; }
From source file:info.boytsov.lucene.CreateIndex.java
License:Open Source License
/**
 * Command-line entry point: builds a Lucene index from a document collection.
 * Usage: indexType indexSource outputDir [commitInterval]. The source is
 * created by the sibling {@code CreateSource} helper; documents are pulled
 * through a benchmark DocMaker until the source runs dry.
 */
public static void main(String[] args) throws Exception {
    if (args.length != 3 && args.length != 4) {
        printUsage();
        System.exit(1);
    }
    String indexType = args[0];
    String indexSource = args[1];
    int commitInterval = 1000000; // default: commit after every 1M documents
    if (args.length >= 4) {
        commitInterval = Integer.parseInt(args[3]);
    }
    System.out.println("Commiting after indexing " + commitInterval + " docs");
    File outputDir = new File(args[2]);
    if (!outputDir.exists()) {
        if (!outputDir.mkdirs()) {
            System.out.println("couldn't create " + outputDir.getAbsolutePath());
            return;
        }
    }
    if (!outputDir.isDirectory()) {
        System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
        return;
    }
    if (!outputDir.canWrite()) {
        System.out.println("Can't write to " + outputDir.getAbsolutePath());
        return;
    }
    FSDirectory dir = FSDirectory.open(outputDir);
    // StandardAnalyzer with its default stop-word set
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
    // CREATE mode overwrites any existing index in the output directory
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    IndexWriter indexWriter = new IndexWriter(dir, config);
    DocMaker docMaker = new DocMaker();
    Properties properties = new Properties();
    // parse each document only once — no infinite looping over the input
    properties.setProperty("content.source.forever", "false");
    properties.setProperty("doc.index.props", "true");
    // store small fields like URL or title ...
    properties.setProperty("doc.stored", "true");
    // ... but not the large body (great savings, 3x reduction in space)
    properties.setProperty("doc.body.stored", "false");
    ContentSource source = CreateSource(indexType, indexSource, properties);
    if (source == null) {
        System.err.println("Failed to create a source: " + indexType + "(" + indexSource + ")");
        printUsage();
        System.exit(1);
    }
    Config c = new Config(properties);
    source.setConfig(c);
    // though this does not seem needed, it is (gets the file opened?)
    source.resetInputs();
    docMaker.setConfig(c, source);
    int count = 0;
    System.out.println("Starting Indexing of " + indexType + " source " + indexSource);
    long start = System.currentTimeMillis();
    Document doc;
    try {
        while ((doc = docMaker.makeDocument()) != null) {
            indexWriter.addDocument(doc);
            ++count;
            if (count % 5000 == 0) {
                System.out.println(
                        "Indexed " + count + " documents in " + (System.currentTimeMillis() - start) + " ms");
            }
            if (count % commitInterval == 0) {
                indexWriter.commit();
                System.out.println("Committed");
            }
        }
    } catch (org.apache.lucene.benchmark.byTask.feeds.NoMoreDataException nmd) {
        // expected when the content source runs out of documents
        System.out.println("Caught NoMoreDataException! -- Finishing");
        // All done
    }
    long finish = System.currentTimeMillis();
    System.out.println("Indexing " + count + " documents took " + (finish - start) + " ms");
    System.out.println("Total data processed: " + source.getTotalBytesCount() + " bytes");
    System.out.println("Index should be located at " + dir.getDirectory().getAbsolutePath());
    docMaker.close();
    indexWriter.commit();
    indexWriter.close();
}
From source file:io.anserini.index.IndexGov2.java
License:Apache License
/**
 * Builds a TrecContentSource configured to read the Gov2 collection from
 * {@code dataDir} using Lucene's TrecGov2Parser, making a single pass over
 * the data (content.source.forever=false) with quiet console output.
 *
 * @param dataDir root directory containing the Gov2 document files
 * @return a ready-to-read content source
 * @throws IllegalStateException if the source's inputs cannot be opened
 */
private static TrecContentSource createGov2Source(String dataDir) {
    TrecContentSource tcs = new TrecContentSource();
    Properties props = new Properties();
    props.setProperty("print.props", "false");
    props.setProperty("content.source.verbose", "false");
    props.setProperty("content.source.excludeIteration", "true");
    props.setProperty("docs.dir", dataDir);
    props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
    props.setProperty("content.source.forever", "false");
    tcs.setConfig(new Config(props));
    try {
        tcs.resetInputs();
    } catch (IOException e) {
        // FIX: previously printStackTrace() and returned a source with no
        // usable inputs; fail fast and preserve the cause instead.
        throw new IllegalStateException("Failed to open Gov2 source at " + dataDir, e);
    }
    return tcs;
}
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;// www . j a va 2s .com try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:luceneingester.TrecIngester.java
License:Apache License
/**
 * Builds a TrecContentSource configured to read a TREC (Gov2-format)
 * collection from {@code dataDir}, making a single pass over the data
 * (content.source.forever=false) with quiet console output.
 *
 * @param dataDir root directory containing the TREC document files
 * @return a ready-to-read content source
 * @throws IllegalStateException if the source's inputs cannot be opened
 */
private static TrecContentSource createTrecSource(String dataDir) {
    TrecContentSource tcs = new TrecContentSource();
    Properties props = new Properties();
    props.setProperty("print.props", "false");
    props.setProperty("content.source.verbose", "false");
    props.setProperty("content.source.excludeIteration", "true");
    props.setProperty("docs.dir", dataDir);
    props.setProperty("trec.doc.parser", "org.apache.lucene.benchmark.byTask.feeds.TrecGov2Parser");
    props.setProperty("content.source.forever", "false");
    tcs.setConfig(new Config(props));
    try {
        tcs.resetInputs();
    } catch (IOException e) {
        // FIX: previously printStackTrace() and returned a source with no
        // usable inputs; fail fast and preserve the cause instead.
        throw new IllegalStateException("Failed to open TREC source at " + dataDir, e);
    }
    return tcs;
}
From source file:source.ContentSourceSource.java
License:Apache License
/**
 * Creates a content source for the requested collection type (Wikipedia,
 * Gov2, or ClueWeb), configures it from the assembled properties, and
 * opens its inputs.
 *
 * @param indexType   collection type name, matched case-insensitively
 * @param indexSource path to the collection file or directory
 * @throws Exception if the type is unknown or the input is missing/unreadable
 */
public ContentSourceSource(String indexType, String indexSource) throws Exception {
    String normalizedType = indexType.toUpperCase();
    mProperties = new Properties();
    // guard against the benchmark framework's strange default of
    // re-reading the input forever
    mProperties.setProperty("content.source.forever", "false");
    if (normalizedType.equals(SOURCE_TYPE_WIKIPEDIA)) {
        File dumpFile = new File(indexSource);
        if (!dumpFile.exists()) {
            throw new Exception("Can't find " + dumpFile.getAbsolutePath());
        }
        if (!dumpFile.canRead()) {
            throw new Exception("Can't read " + dumpFile.getAbsolutePath());
        }
        mProperties.setProperty("keep.image.only.docs", "false");
        mProperties.setProperty("docs.file", dumpFile.getAbsolutePath());
        mSource = new EnwikiContentSource();
    } else if (normalizedType.equals(SOURCE_TYPE_GOV2)) {
        mProperties.setProperty("docs.dir", indexSource);
        mProperties.setProperty("work.dir", "/tmp");
        mProperties.setProperty("html.parser", "parsers.DemoHTMLParser");
        mProperties.setProperty("trec.doc.parser", "parsers.TrecGov2Parser");
        mSource = new TrecContentSource();
    } else if (normalizedType.equals(SOURCE_TYPE_CLUEWEB)) {
        mProperties.setProperty("docs.dir", indexSource);
        mProperties.setProperty("work.dir", "/tmp");
        // parsers.DemoHTMLParser HTML parser fails on this collection
        //mProperties.setProperty("html.parser", "parsers.LeoHTMLParser");
        mProperties.setProperty("html.parser", "parsers.DemoHTMLParser");
        mSource = new ClueWebContentSource();
    } else {
        throw new Exception("Unsupported index type: " + indexType);
    }
    mConfig = new Config(mProperties);
    mSource.setConfig(mConfig);
    mSource.resetInputs(); // not clear if this is 100% needed, but let's keep it
}