List of usage examples for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB
@Override public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB)
From source file:MakeLuceneIndex.java
License:Apache License
/** Index all text files under a directory. * @throws UnsupportedEncodingException * @throws FileNotFoundException */ public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException { String baseDir = "/home/chrisschaefer/"; //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2"; String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2"; String luceneIndexName = "enwiki-20130604-lucene2"; System.currentTimeMillis();/* ww w . ja va2s . c o m*/ boolean bIgnoreStubs = false; for (int i = 0; i < args.length; ++i) { if (args[i].equals("-luceneindex")) luceneIndexName = args[++i]; if (args[i].equals("-basedir")) baseDir = args[++i]; if (args[i].equals("-dumpfile")) wikiDumpFile = args[++i]; if (args[i].equals("-includestubs")) bIgnoreStubs = true; } String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt"; String logPath = baseDir + luceneIndexName + ".log"; PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8"); PrintWriter logger = new PrintWriter(logPath, "UTF-8"); logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'"); System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'"); Date start = new Date(); try { Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName)); Analyzer analyzer = new WikipediaAnalyzer(); // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer); // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); iwc.setSimilarity(new ESASimilarity()); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmxm or -Xmx1g): // iwc.setRAMBufferSizeMB(2000.0); IndexWriter writer = new IndexWriter(dir, iwc); Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile); wikidumpExtractor.setLinkSeparator("_"); wikidumpExtractor.setCategorySeparator("_"); wikidumpExtractor.setTitleSeparator(" "); int iStubs = 0; int iArticleCount = 0; int iSkippedPageCount = 0; long iStartTime = java.lang.System.nanoTime(); long iTime = iStartTime; while (wikidumpExtractor.nextPage()) { if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) { ++iSkippedPageCount; continue; } if (bIgnoreStubs && wikidumpExtractor.getStub()) { ++iStubs; continue; } // skip pages with less than 5 out links if (wikidumpExtractor.getPageLinkList(true).size() < 5) { ++iSkippedPageCount; continue; } if (wikidumpExtractor.getPageCategories().equals("")) { ++iSkippedPageCount; logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false)); continue; } else { for (String link : wikidumpExtractor.getPageLinkList(false)) { // artikelTextWriter.println(link); if (_inLinks.containsKey(link)) { int tmp = _inLinks.get(link); tmp++; _inLinks.put(link, tmp); } else { _inLinks.put(link, 1); } } } if (wikidumpExtractor.getPageText().equals("")) { ++iSkippedPageCount; continue; } artikelTextWriter.println( wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false)); ++iArticleCount; if (iArticleCount % 1000 == 0) { logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount + " iSkippedPageCount: " + iSkippedPageCount); } } artikelTextWriter.close(); iArticleCount = 0; PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8"); BufferedReader br = new BufferedReader(new FileReader(rawTextPath)); String line = br.readLine(); while (line != null) { int endOfTitle = line.indexOf("\t"); String title = 
line.substring(0, endOfTitle); if (_inLinks.containsKey(title)) { int inlinks = _inLinks.get(title); artikelInLinkWriter.println(title + "\t" + inlinks); if (inlinks > 4) { //System.out.println("inlinks > 0 "); Document doc = new Document(); ++iArticleCount; // wikidumpExtractor.setTitleSeparator( "_" ); // doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) ); // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) ); //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES)); doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " " + line.substring(endOfTitle + 1), Field.Store.NO)); // System.out.println(title + " " + // title + " " + // title + " " + // title + " " + // line.substring(endOfTitle+1)); writer.addDocument(doc); if (iArticleCount % 1000 == 0) { writer.commit(); logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount + " iSkippedPageCount: " + iSkippedPageCount); } } } else { artikelInLinkWriter.println(title + "\t0"); } line = br.readLine(); } br.close(); artikelInLinkWriter.close(); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // writer.commit(); writer.forceMerge(1); writer.close(); Date end = new Date(); String endStatement = end.getTime() - start.getTime() + " total milliseconds (" + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles."; logger.println(endStatement); System.out.println(endStatement); logger.close(); } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:MakeLuceneIndexPreprocessed.java
License:Apache License
/** Index all text files under a directory. * @throws UnsupportedEncodingException * @throws FileNotFoundException */ public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException { String baseDir = "/home/chrisschaefer/"; String inputLuceneIndexName = "2013-06-18-lucene-gab"; String luceneIndexName = "2013-06-18-lucene-gab-standard"; System.currentTimeMillis();/*from w w w. j av a2s . com*/ for (int i = 0; i < args.length; ++i) { if (args[i].equals("-inputluceneindex")) inputLuceneIndexName = args[++i]; if (args[i].equals("-outputluceneindex")) luceneIndexName = args[++i]; if (args[i].equals("-basedir")) baseDir = args[++i]; } String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt"; String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt"; String logPath = baseDir + inputLuceneIndexName + ".log"; PrintWriter logger = new PrintWriter(logPath, "UTF-8"); logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'"); System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'"); Date start = new Date(); logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0"); try { Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName)); // Analyzer analyzer = new WikipediaAnalyzer(); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer); // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmxm or -Xmx1g): // iwc.setRAMBufferSizeMB(2000.0); // iwc.setSimilarity(new ESASimilarity()); IndexWriter writer = new IndexWriter(dir, iwc); int iArticleCount = 0; int iSkippedPageCount = 0; BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath)); BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath)); String lineText = rawTextReader.readLine(); String lineLinks = artikelInLinksReader.readLine(); while (lineText != null) { // String title = lineText.substring(0, lineText.indexOf("\t")); // while(!title.equals(lineLinks.substring(0, lineLinks.indexOf("\t")))){ // lineLinks = artikelInLinksReader.readLine(); // } int endOfTitle = lineText.indexOf("\t"); String title = lineText.substring(0, endOfTitle); if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) { ++iArticleCount; Document doc = new Document(); doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " " + lineText.substring(endOfTitle + 1), Field.Store.NO)); // System.out.println(title + " " + // title + " " + // title + " " + // title + " " + // lineText.substring(endOfTitle+1)); writer.addDocument(doc); if (iArticleCount % 1000 == 0) { writer.commit(); logger.println(new Date().toString() + "phase 2 -- iArticleCount: " + iArticleCount + " iSkippedPageCount: " + iSkippedPageCount); logger.flush(); } } lineText = rawTextReader.readLine(); lineLinks = artikelInLinksReader.readLine(); } rawTextReader.close(); artikelInLinksReader.close(); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // writer.commit(); writer.forceMerge(1); writer.close(); Date end = new Date(); String endStatement = end.getTime() - start.getTime() + " total milliseconds (" + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles."; logger.println(endStatement); System.out.println(endStatement); logger.close(); } catch (Exception e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:IndexAndSearchOpenStreetMaps1D.java
License:Apache License
private static void createIndex() throws IOException { long t0 = System.nanoTime(); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); int BUFFER_SIZE = 1 << 16; // 64K InputStream is = Files .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt")); BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE); Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : ""))); IndexWriterConfig iwc = new IndexWriterConfig(null); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //iwc.setMaxBufferedDocs(109630); //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); iwc.setRAMBufferSizeMB(256.0); iwc.setMergePolicy(new LogDocMergePolicy()); iwc.setMergeScheduler(new SerialMergeScheduler()); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); IndexWriter w = new IndexWriter(dir, iwc); int count = 0; byte[] scratch = new byte[4]; while (true) { String line = reader.readLine(); if (line == null) { break; }//from ww w . j av a2 s. c om String[] parts = line.split(","); //long id = Long.parseLong(parts[0]); int lat = (int) (1000000. * Double.parseDouble(parts[1])); //int lon = (int) (1000000. * Double.parseDouble(parts[2])); Document doc = new Document(); if (USE_NF) { doc.add(new LegacyIntField("latnum", lat, Field.Store.NO)); //doc.add(new LongField("lonnum", lon, Field.Store.NO)); } else { doc.add(new IntPoint("lat", lat)); //doc.add(new SortedNumericDocValuesField("lon", lon)); } w.addDocument(doc); count++; if (count % 1000000 == 0) { System.out.println(count + "..."); } } //w.forceMerge(1); w.commit(); System.out.println(w.maxDoc() + " total docs"); w.close(); long t1 = System.nanoTime(); System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index"); }
From source file:IndexTaxis.java
License:Apache License
public static void main(String[] args) throws Exception { Path indexPath = Paths.get(args[0]); Directory dir = FSDirectory.open(indexPath); int threadCount = Integer.parseInt(args[1]); Path docsPath = Paths.get(args[2]); IndexWriterConfig iwc = new IndexWriterConfig(); //System.out.println("NOW SET INFO STREAM"); iwc.setRAMBufferSizeMB(1024.); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).disableAutoIOThrottle(); final IndexWriter w = new IndexWriter(dir, iwc); BufferedInputStream docs = new BufferedInputStream(Files.newInputStream(docsPath, StandardOpenOption.READ)); // parse the header fields List<String> fieldsList = new ArrayList<>(); StringBuilder builder = new StringBuilder(); while (true) { int x = docs.read(); if (x == -1) { throw new IllegalArgumentException( "hit EOF while trying to read CSV header; are you sure you have the right CSV file!"); }//from w w w . j a v a2 s . 
c o m byte b = (byte) x; if (b == NEWLINE) { fieldsList.add(builder.toString()); break; } else if (b == COMMA) { fieldsList.add(builder.toString()); builder.setLength(0); } else { // this is OK because headers are all ascii: builder.append((char) b); } } final String[] fields = fieldsList.toArray(new String[fieldsList.size()]); Thread[] threads = new Thread[threadCount]; final AtomicInteger docCounter = new AtomicInteger(); final AtomicLong bytesCounter = new AtomicLong(); startNS = System.nanoTime(); for (int i = 0; i < threadCount; i++) { final int threadID = i; threads[i] = new Thread() { @Override public void run() { try { _run(); } catch (Exception e) { throw new RuntimeException(e); } } private void _run() throws IOException { while (true) { Chunk chunk = readChunk(docs); if (chunk == null) { break; } indexOneChunk(fields, chunk, w, docCounter, bytesCounter); } } }; threads[i].start(); } for (int i = 0; i < threadCount; i++) { threads[i].join(); } System.out.println("Indexing done; now close"); w.close(); docs.close(); }
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption("i", null, true, "input file"); options.addOption("o", null, true, "output directory"); options.addOption("r", null, true, "optional output TREC-format QREL file"); options.addOption("bm25_b", null, true, "BM25 parameter: b"); options.addOption("bm25_k1", null, true, "BM25 parameter: k1"); options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity"); Joiner commaJoin = Joiner.on(','); Joiner spaceJoin = Joiner.on(' '); options.addOption("source_type", null, true, "document source type: " + commaJoin.join(SourceFactory.getDocSourceList())); // If you increase this value, you may need to modify the following line in *.sh file // export MAVEN_OPTS="-Xms8192m -server" double ramBufferSizeMB = 1024 * 8; // 8 GB CommandLineParser parser = new org.apache.commons.cli.GnuParser(); IndexWriter indexWriter = null;//from w ww .j a v a 2s . c o m BufferedWriter qrelWriter = null; int docNum = 0; try { CommandLine cmd = parser.parse(options, args); String inputFileName = null, outputDirName = null, qrelFileName = null; if (cmd.hasOption("i")) { inputFileName = cmd.getOptionValue("i"); } else { Usage("Specify 'input file'", options); } if (cmd.hasOption("o")) { outputDirName = cmd.getOptionValue("o"); } else { Usage("Specify 'index directory'", options); } if (cmd.hasOption("r")) { qrelFileName = cmd.getOptionValue("r"); } String sourceName = cmd.getOptionValue("source_type"); if (sourceName == null) Usage("Specify document source type", options); if (qrelFileName != null) qrelWriter = new BufferedWriter(new FileWriter(qrelFileName)); File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { 
System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } boolean useFixedBM25 = cmd.hasOption("bm25fixed"); float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT; if (cmd.hasOption("bm25_k1")) { try { bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_k1'", options); } } if (cmd.hasOption("bm25_b")) { try { bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b")); } catch (NumberFormatException e) { Usage("Wrong format for 'bm25_b'", options); } } EnglishAnalyzer analyzer = new EnglishAnalyzer(); FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName)); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer); /* OpenMode.CREATE creates a new index or overwrites an existing one. https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE */ indexConf.setOpenMode(OpenMode.CREATE); indexConf.setRAMBufferSizeMB(ramBufferSizeMB); System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b)); if (useFixedBM25) { System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b)); } else { System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b)); indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b)); } indexWriter = new IndexWriter(indexDir, indexConf); DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName); DocumentEntry inpDoc = null; TextCleaner textCleaner = new TextCleaner(null); while ((inpDoc = inpDocSource.next()) != null) { ++docNum; Document luceneDoc = new Document(); ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText); String cleanText = spaceJoin.join(cleanedToks); // System.out.println(inpDoc.mDocId); // System.out.println(cleanText); // 
System.out.println("=============================="); luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES)); luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES)); indexWriter.addDocument(luceneDoc); if (inpDoc.mIsRel != null && qrelWriter != null) { saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0); } if (docNum % 1000 == 0) System.out.println(String.format("Indexed %d documents", docNum)); } } catch (ParseException e) { e.printStackTrace(); Usage("Cannot parse arguments" + e, options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } finally { System.out.println(String.format("Indexed %d documents", docNum)); try { if (null != indexWriter) indexWriter.close(); if (null != qrelWriter) qrelWriter.close(); } catch (IOException e) { System.err.println("IO exception: " + e); e.printStackTrace(); } } }
From source file:biospectra.index.Indexer.java
License:Apache License
private void initialize(File indexPath, int kmerSize, boolean minStrandKmer, Similarity similarity, int workerThreads, int ramBufferSize) throws Exception { if (!indexPath.exists()) { indexPath.mkdirs();//w ww . ja v a 2 s . c o m } if (indexPath.exists()) { cleanUpDirectory(indexPath); } this.indexPath = indexPath; this.minStrandKmer = minStrandKmer; this.analyzer = new KmerIndexAnalyzer(kmerSize, minStrandKmer); Directory dir = new MMapDirectory(this.indexPath.toPath()); IndexWriterConfig config = new IndexWriterConfig(this.analyzer); if (similarity != null) { config.setSimilarity(similarity); } this.workerThreads = workerThreads; if (ramBufferSize > 0) { config.setRAMBufferSizeMB(ramBufferSize); } config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); this.indexWriter = new IndexWriter(dir, config); this.executor = new BlockingExecutor(this.workerThreads, this.workerThreads * 2); for (int i = 0; i < this.workerThreads; i++) { Document doc = new Document(); Field filenameField = new StringField(IndexConstants.FIELD_FILENAME, "", Field.Store.YES); Field headerField = new StringField(IndexConstants.FIELD_HEADER, "", Field.Store.YES); Field sequenceDirectionField = new StringField(IndexConstants.FIELD_SEQUENCE_DIRECTION, "", Field.Store.YES); Field taxonTreeField = new StringField(IndexConstants.FIELD_TAXONOMY_TREE, "", Field.Store.YES); Field sequenceField = new TextField(IndexConstants.FIELD_SEQUENCE, "", Field.Store.NO); doc.add(filenameField); doc.add(headerField); doc.add(sequenceDirectionField); doc.add(taxonTreeField); doc.add(sequenceField); this.freeQueue.offer(doc); } }
From source file:cn.hbu.cs.esearch.index.DiskSearchIndex.java
License:Apache License
/** * Opens an index modifier.//from w w w. j av a 2 s. com * @param analyzer Analyzer * @return IndexModifer instance */ @Override public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException { if (_indexWriter != null) { return _indexWriter; } Directory directory = _dirMgr.getDirectory(true); log.info("opening index writer at: " + _dirMgr.getPath()); EsearchMergePolicy mergePolicy = new EsearchMergePolicy(); mergePolicy.setMergePolicyParams(_mergePolicyParams); // hao: autocommit is set to false with this constructor IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer); config.setOpenMode(OpenMode.CREATE_OR_APPEND); _deletionPolicy = new ZoieIndexDeletionPolicy(); config.setIndexDeletionPolicy(_deletionPolicy); config.setMergeScheduler(_mergeScheduler); config.setMergePolicy(mergePolicy); config.setReaderPooling(false); if (similarity != null) { config.setSimilarity(similarity); } config.setRAMBufferSizeMB(5); IndexWriter idxWriter = new IndexWriter(directory, config); // we need retrieve deletionPolicy from IndexWriter since deletionPolicy is deep cloned _deletionPolicy = (ZoieIndexDeletionPolicy) (idxWriter.getConfig().getIndexDeletionPolicy()); _indexWriter = idxWriter; return idxWriter; }
From source file:cn.hbu.cs.esearch.index.RAMSearchIndex.java
License:Apache License
/**
 * Returns the writer for the in-memory index, opening it on the first call.
 *
 * <p>When a writer is already cached in {@code _indexWriter} it is returned unchanged and both
 * arguments are ignored. Otherwise a writer is opened on {@code _directory} in
 * {@code CREATE_OR_APPEND} mode with the configured merge scheduler, an
 * {@code EsearchMergePolicy} that does not use compound files, reader pooling disabled and a
 * 3 MB RAM buffer.
 *
 * @param analyzer analyzer used by a newly opened writer
 * @param similarity similarity to install, or {@code null} to keep the default
 * @return the cached or newly opened index writer
 * @throws IOException if the writer cannot be opened
 */
@Override
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null) {
        return _indexWriter;
    }

    EsearchMergePolicy esearchMergePolicy = new EsearchMergePolicy();
    esearchMergePolicy.setMergePolicyParams(_mergePolicyParams);
    esearchMergePolicy.setUseCompoundFile(false);

    IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    writerConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    writerConfig.setMergeScheduler(_mergeScheduler);
    writerConfig.setMergePolicy(esearchMergePolicy);
    writerConfig.setReaderPooling(false);
    if (similarity != null) {
        writerConfig.setSimilarity(similarity);
    }
    writerConfig.setRAMBufferSizeMB(3);

    IndexWriter opened = new IndexWriter(_directory, writerConfig);
    _indexWriter = opened;
    return opened;
}
From source file:com.aliasi.lingmed.medline.IndexMedline.java
License:Lingpipe license
/** * Run the command. See class documentation above for details on * arguments and behavior.//from w w w. j a va2 s.co m */ public void run() { System.out.println("start run"); try { File[] files = getLaterFiles(mDistDir); System.out.println("Total files to process: " + files.length); System.out.println("File names: " + java.util.Arrays.asList(files)); // if (mLogger.isDebugEnabled()) // mLogger.debug("File names: " + java.util.Arrays.asList(files)); if (files.length > 0) { MedlineParser parser = new MedlineParser(true); // true = save raw XML Directory fsDir = FSDirectory.open(mIndex); IndexWriterConfig iwConf = new IndexWriterConfig(Version.LUCENE_36, mCodec.getAnalyzer()); iwConf.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); iwConf.setRAMBufferSizeMB(RAM_BUF_SIZE); if (sIsBaseline) { LogDocMergePolicy ldmp = new LogDocMergePolicy(); ldmp.setMergeFactor(MERGE_FACTOR_HI); iwConf.setMergePolicy(ldmp); } IndexWriter indexWriter = new IndexWriter(fsDir, iwConf); for (File file : files) { System.out.println("processing file: " + file); MedlineIndexer indexer = new MedlineIndexer(indexWriter, mCodec); parser.setHandler(indexer); parseFile(parser, file); indexer.close(); recordFile(indexWriter, file.getName()); System.out.println("completed processing file: " + file); } System.out.println("All files parsed, now optimize index"); indexWriter.forceMerge(1); indexWriter.commit(); indexWriter.close(); } System.out.println("Processing complete."); } catch (Exception e) { // mLogger.warn("Unexpected Exception: "+e.getMessage()); // mLogger.warn("stack trace: "+Logging.logStackTrace(e)); // mLogger.warn("Aborting this run"); IllegalStateException e2 = new IllegalStateException(e.getMessage()); e2.setStackTrace(e.getStackTrace()); throw e2; } }
From source file:com.baidu.rigel.biplatform.tesseract.isservice.index.service.IndexWriterFactory.java
License:Open Source License
/** * /* ww w. ja v a 2s. c om*/ * getIndexWriter * * @param idxPath * * @return IndexWriter * @throws IOException * IO */ public static synchronized IndexWriter getIndexWriter(String idxPath) throws IOException { LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_BEGIN, "getIndexWriter", "[idxPath:" + idxPath + "]")); IndexWriter indexWriter = null; if (INSTANCE.idxWriterMaps.containsKey(idxPath)) { indexWriter = INSTANCE.idxWriterMaps.get(idxPath); LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_PROCESS_NO_PARAM, "getIndexWriter", "return exist IndexWriter ")); } else { File indexFile = new File(idxPath); Directory directory = FSDirectory.open(indexFile); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_1, new StandardAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); indexWriterConfig.setRAMBufferSizeMB(64.0); indexWriterConfig.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH); indexWriter = new IndexWriter(directory, indexWriterConfig); INSTANCE.idxWriterMaps.put(idxPath, indexWriter); LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_PROCESS_NO_PARAM, "getIndexWriter", "create new IndexWriter ")); } LOGGER.info(String.format(LogInfoConstants.INFO_PATTERN_FUNCTION_END, "getIndexWriter", "[idxPath:" + idxPath + "]")); return indexWriter; }