List of usage examples for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB
@Override public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB)
From source file:org.opensolaris.opengrok.index.IndexDatabase.java
License:Open Source License
/** * Update the content of this index database * * @throws IOException if an error occurs * @throws HistoryException if an error occurs when accessing the history *///from w ww. j ava 2s.c o m public void update() throws IOException, HistoryException { synchronized (lock) { if (running) { throw new IOException("Indexer already running!"); } running = true; interrupted = false; } String ctgs = RuntimeEnvironment.getInstance().getCtags(); if (ctgs != null) { ctags = new Ctags(); ctags.setBinary(ctgs); } if (ctags == null) { log.severe("Unable to run ctags! searching definitions will not work!"); } if (ctags != null) { String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile(); if (filename != null) { ctags.setCTagsExtraOptionsFile(filename); } } try { Analyzer analyzer = AnalyzerGuru.getAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); iwc.setRAMBufferSizeMB(RuntimeEnvironment.getInstance().getRamBufferSize()); writer = new IndexWriter(indexDirectory, iwc); writer.commit(); // to make sure index exists on the disk if (directories.isEmpty()) { if (project == null) { directories.add(""); } else { directories.add(project.getPath()); } } for (String dir : directories) { File sourceRoot; if ("".equals(dir)) { sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile(); } else { sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir); } HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot); String startuid = Util.path2uid(dir, ""); IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index Terms terms = null; int numDocs = reader.numDocs(); if (numDocs > 0) { Fields uFields = MultiFields.getFields(reader);//reader.getTermVectors(0); terms = uFields.terms(QueryBuilder.U); } try { if (numDocs > 0) { uidIter = terms.iterator(uidIter); TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid)); 
//init uid if (stat == TermsEnum.SeekStatus.END) { uidIter = null; log.log(Level.WARNING, "Couldn't find a start term for {0}, empty u field?", startuid); } } // The code below traverses the tree to get total count. int file_cnt = 0; if (RuntimeEnvironment.getInstance().isPrintProgress()) { log.log(Level.INFO, "Counting files in {0} ...", dir); file_cnt = indexDown(sourceRoot, dir, true, 0, 0); if (log.isLoggable(Level.INFO)) { log.log(Level.INFO, "Need to process: {0} files for {1}", new Object[] { file_cnt, dir }); } } indexDown(sourceRoot, dir, false, 0, file_cnt); while (uidIter != null && uidIter.term() != null && uidIter.term().utf8ToString().startsWith(startuid)) { removeFile(); BytesRef next = uidIter.next(); if (next == null) { uidIter = null; } } } finally { reader.close(); } } } finally { if (writer != null) { try { writer.prepareCommit(); writer.commit(); writer.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing writer", e); } } if (ctags != null) { try { ctags.close(); } catch (IOException e) { log.log(Level.WARNING, "An error occured while closing ctags process", e); } } synchronized (lock) { running = false; } } if (!isInterrupted() && isDirty()) { if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) { optimize(); } RuntimeEnvironment env = RuntimeEnvironment.getInstance(); File timestamp = new File(env.getDataRootFile(), "timestamp"); String purpose = "used for timestamping the index database."; if (timestamp.exists()) { if (!timestamp.setLastModified(System.currentTimeMillis())) { log.log(Level.WARNING, "Failed to set last modified time on ''{0}'', {1}", new Object[] { timestamp.getAbsolutePath(), purpose }); } } else { if (!timestamp.createNewFile()) { log.log(Level.WARNING, "Failed to create file ''{0}'', {1}", new Object[] { timestamp.getAbsolutePath(), purpose }); } } } }
From source file:org.simple.nlp.dictionary.index.IndexEngine.java
License:Open Source License
/**
 * Opens (creating the directory if necessary) a Lucene index at the given
 * path and prepares an {@link IndexWriter} over it.
 *
 * @param dirPath filesystem path of the index directory
 * @throws IOException if the directory cannot be created or the index
 *                     cannot be opened
 */
public IndexEngine(String dirPath) throws IOException {
    File file = new File(dirPath);
    // Fail fast when the index directory cannot be created; the original
    // ignored mkdirs()' return value, so a failure only surfaced later as a
    // confusing error from FSDirectory/IndexWriter.
    if (!file.exists() && !file.mkdirs()) {
        throw new IOException("Unable to create index directory: " + file.getAbsolutePath());
    }
    this.dir = FSDirectory.open(file);
    // Clear a stale write lock left behind by a crashed process.
    // NOTE(review): this assumes no other live process holds the lock — confirm.
    if (IndexWriter.isLocked(dir)) {
        IndexWriter.unlock(dir);
    }
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_46, new CustomAnalyzer());
    conf.setRAMBufferSizeMB(256); // large RAM buffer: fewer flushes during bulk indexing
    this.writer = new IndexWriter(dir, conf);
}
From source file:org.sonatype.timeline.internal.DefaultTimelineIndexer.java
License:Open Source License
/**
 * (Re)starts the indexer: tears down any previously open writer/directory,
 * opens the index directory from the supplied configuration, clears a stale
 * write lock if one exists, then creates a fresh writer, commits it, and
 * builds a new searcher manager. Bumps the generation counter on success.
 *
 * @param configuration timeline configuration providing the index directory
 * @throws IOException if the index cannot be opened
 */
protected void start(final TimelineConfiguration configuration) throws IOException {
    // Dispose of whatever was previously open.
    closeIndexWriter();
    if (directory != null) {
        directory.close();
    }
    directory = FSDirectory.open(configuration.getIndexDirectory());
    // A leftover lock from a crashed process would block the new writer.
    if (IndexReader.indexExists(directory) && IndexWriter.isLocked(directory)) {
        IndexWriter.unlock(directory);
    }
    final StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_36, analyzer);
    writerConfig.setMergeScheduler(new SerialMergeScheduler());
    writerConfig.setRAMBufferSizeMB(2.0);
    indexWriter = new IndexWriter(directory, writerConfig);
    indexWriter.commit();
    searcherManager = new SearcherManager(indexWriter, false, new SearcherFactory());
    generation++;
}
From source file:org.wikimedia.revdiffsearch.IndexMerger.java
License:Open Source License
/** Index all text files under a directory. */ public static void main(String[] args) { if (args.length != 2) { System.out.println("Usage: java -jar IndexMerger.jar " + "merged_index_dir existing_index_dir1 existing_index_dir2 ..."); System.out.println(" merged_index_dir: A directory where the merged " + "index will be stored"); System.out.println(" e.g. merged_indexes"); System.out//from w ww .java 2 s. co m .println(" existing_indexes_dir: A directory where the " + "indexes that have to merged exist"); System.out.println(" e.g. indexes/"); System.out.println(" e.g. index1"); System.out.println(" e.g. index2"); System.out.println(" e.g. index3"); System.exit(1); } int ramBufferSizeMB = 1024; int ngram = 3; File INDEX_DIR = new File(args[0]); { String s; if ((s = System.getProperty("ngram")) != null) { ngram = Integer.parseInt(s); } if ((s = System.getProperty("ramBufferSize")) != null) { ramBufferSizeMB = Integer.parseInt(s); } } INDEX_DIR.mkdir(); Date start = new Date(); try { IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_44, new SimpleNGramAnalyzer(ngram)); LogDocMergePolicy lmp = new LogDocMergePolicy(); lmp.setMergeFactor(1000); cfg.setRAMBufferSizeMB(ramBufferSizeMB); cfg.setMergePolicy(lmp); IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR), cfg); // IndexWriter writer = new IndexWriter(INDEX_DIR, // new StandardAnalyzer(Version.LUCENE_44), // true); // writer.setMergeFactor(1000); // writer.setRAMBufferSizeMB(50); List<Directory> indexes = new ArrayList<Directory>(); for (String indexdir : Arrays.asList(args).subList(1, args.length)) { System.out.println("Adding: " + indexdir); indexes.add(FSDirectory.open(new File(indexdir))); } System.out.print("Merging added indexes..."); writer.addIndexes(indexes.toArray(new Directory[indexes.size()])); System.out.println("done"); writer.close(); System.out.println("done"); Date end = new Date(); System.out.println("It took: " + ((end.getTime() - start.getTime()) / 1000) + "\""); } catch 
(IOException e) { e.printStackTrace(); } }
From source file:org.zenoss.zep.index.impl.lucene.LuceneEventIndexMapper.java
License:Open Source License
/**
 * Builds the {@link IndexWriterConfig} used for the event index.
 *
 * Reads the optional {@code zep.index.ram_buffer_size_mb} setting from the
 * ZEP configuration; an invalid value is logged and ignored so Lucene's
 * default RAM buffer size remains in effect.
 *
 * @param analyzer analyzer to configure on the writer
 * @param zepInstance source of the ZEP configuration map
 * @return the configured IndexWriterConfig
 */
public static IndexWriterConfig createIndexWriterConfig(Analyzer analyzer, ZepInstance zepInstance) {
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LUCENE_VERSION, analyzer);
    Map<String, String> cfg = zepInstance.getConfig();
    String ramBufferSizeMb = cfg.get("zep.index.ram_buffer_size_mb");
    if (ramBufferSizeMb != null) {
        try {
            // parseDouble yields the primitive directly; Double.valueOf boxed
            // a Double only to immediately unbox it for the double parameter.
            indexWriterConfig.setRAMBufferSizeMB(Double.parseDouble(ramBufferSizeMb.trim()));
        } catch (NumberFormatException nfe) {
            logger.warn("Invalid value for zep.index.ram_buffer_size_mb: {}", ramBufferSizeMb);
        }
    }
    return indexWriterConfig;
}
From source file:perf.AutoPrefixPerf.java
License:Apache License
/**
 * Benchmark driver: optionally builds an index of long values from
 * numbersFile (either as NumericField with the given precision step, or as
 * binary terms with auto-prefix settings), then runs up to 200 range queries
 * from queriesFile ten times, reporting the best iteration time.
 *
 * args: numbersFile queriesFile indexPath precStep [minTermsInPrefix maxTermsInPrefix]
 * precStep != 0 selects NumericField mode; 0 selects binary-term mode.
 */
public static void main(String[] args) throws Exception {
    String numbersFile = args[0];
    String queriesFile = args[1];
    Path indexPath = Paths.get(args[2]);
    int precStep = Integer.parseInt(args[3]);
    boolean useNumericField = (precStep != 0);
    int maxTermsInPrefix;
    int minTermsInPrefix;
    if (useNumericField == false) {
        minTermsInPrefix = Integer.parseInt(args[4]);
        maxTermsInPrefix = Integer.parseInt(args[5]);
    } else {
        minTermsInPrefix = 0;
        maxTermsInPrefix = 0;
    }
    // Reusable 8-byte token holding the binary encoding of each long value.
    BytesRefBuilder binaryToken = new BytesRefBuilder();
    binaryToken.grow(8);
    binaryToken.setLength(8);
    Directory dir = FSDirectory.open(indexPath);
    // NOTE(review): this check runs AFTER FSDirectory.open, which may create
    // the directory, and the else-branch message ("index already exists")
    // reads inverted relative to the condition — confirm intent.
    if (Files.notExists(indexPath) == false) {
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setMaxBufferedDocs(30000);
        // -1 disables the RAM-based flush trigger, so flushing is governed by
        // maxBufferedDocs alone (deterministic segment structure).
        iwc.setRAMBufferSizeMB(-1);
        iwc.setMergePolicy(new LogDocMergePolicy());
        final PostingsFormat pf;
        if (useNumericField) {
            // Disable auto-prefix when testing NumericField!
            if (minTermsInPrefix != 0) {
                throw new IllegalArgumentException("only precStep or minTermsInPrefix should be non-zero");
            }
            pf = new Lucene50PostingsFormat(25, 48, 0, 0);
        } else {
            /*
            if (minTermsInPrefix == 0) {
              throw new IllegalArgumentException("one of precStep or minTermsInPrefix must be non-zero");
            }
            */
            pf = new Lucene50PostingsFormat(25, 48, minTermsInPrefix, maxTermsInPrefix);
            //pf = new Lucene50PostingsFormat(25, 48, minTermsInPrefix, Integer.MAX_VALUE);
        }
        iwc.setCodec(new Lucene53Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
                return pf;
            }
        });
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        iwc.setMergeScheduler(new SerialMergeScheduler());
        //TieredMergePolicy tmp = (TieredMergePolicy) iwc.getMergePolicy();
        //tmp.setFloorSegmentMB(.1);
        //ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
        // More concurrency (for SSD)
        //cms.setMaxMergesAndThreads(5, 3);
        final IndexWriter w = new IndexWriter(dir, iwc);
        // One reusable Document/Field pair; only the value changes per doc.
        Document doc = new Document();
        Field field;
        if (useNumericField) {
            FieldType longFieldType = new FieldType(LongField.TYPE_NOT_STORED);
            longFieldType.setNumericPrecisionStep(precStep);
            longFieldType.freeze();
            field = new LongField("number", 0L, longFieldType);
            doc.add(field);
        } else {
            FieldType longFieldType = new FieldType(TextField.TYPE_NOT_STORED);
            longFieldType.setIndexOptions(IndexOptions.DOCS_ONLY);
            longFieldType.setOmitNorms(true);
            longFieldType.setIndexRanges(true);
            longFieldType.freeze();
            // The field wraps binaryToken; longToBytes() below mutates it in
            // place before each addDocument.
            field = new Field("number", new BinaryTokenStream(binaryToken.get()), longFieldType);
            doc.add(field);
        }
        long startMS = System.currentTimeMillis();
        // 64K buffer:
        InputStream is = new FileInputStream(numbersFile);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
        int count = 0;
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                break;
            }
            Long v = Long.parseLong(line.trim());
            if (useNumericField) {
                field.setLongValue(v);
            } else {
                //NumericUtils.longToPrefixCoded(v, 0, binaryToken);
                longToBytes(v, binaryToken);
                //if (bytesToLong(binaryToken.get()) != v) {
                //  throw new RuntimeException("wrong long: v=" + v + " vs " + bytesToLong(binaryToken.get()));
                //}
            }
            w.addDocument(doc);
            count++;
            if (count % 200000 == 0) {
                long ms = System.currentTimeMillis();
                System.out.println("Indexed " + count + ": " + ((ms - startMS) / 1000.0) + " sec");
            }
        }
        reader.close();
        System.out.println(
                "Final Indexed " + count + ": " + ((System.currentTimeMillis() - startMS) / 1000.0) + " sec");
        // nocommit just to make debugging easier:
        //System.out.println("Optimize...");
        //w.forceMerge(1);
        System.out.println("Close...");
        w.close();
        System.out.println("After close: " + ((System.currentTimeMillis() - startMS) / 1000.0) + " sec");
        // Print CheckIndex:
        ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
        CheckIndex checker = new CheckIndex(dir);
        checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8), true);
        CheckIndex.Status status = checker.checkIndex();
        System.out.println("Done CheckIndex:");
        System.out.println(bos.toString(IOUtils.UTF_8));
        if (status.clean == false) {
            throw new IllegalStateException("CheckIndex failed");
        }
        // Sum per-segment sizes for reporting.
        SegmentInfos infos = new SegmentInfos();
        infos.read(dir);
        long totBytes = 0;
        for (SegmentCommitInfo info : infos) {
            totBytes += info.sizeInBytes();
        }
        System.out.println("\nTotal index size: " + totBytes + " bytes");
    } else {
        System.out.println("Skip indexing: index already exists");
    }
    // Load up to 200 "min max" range-query lines.
    // NOTE(review): this second reader is never closed — resource leak.
    List<Query> queries = new ArrayList<>();
    InputStream is = new FileInputStream(queriesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] numbers = line.trim().split(" ");
        if (numbers.length != 2) {
            throw new IllegalArgumentException("could not parse query line: " + line);
        }
        long minValue = Long.parseLong(numbers[0]);
        long maxValue = Long.parseLong(numbers[1]);
        if (useNumericField) {
            queries.add(NumericRangeQuery.newLongRange("number", precStep, minValue, maxValue, true, true));
        } else {
            longToBytes(minValue, binaryToken);
            BytesRef minTerm = binaryToken.toBytesRef();
            longToBytes(maxValue, binaryToken);
            BytesRef maxTerm = binaryToken.toBytesRef();
            queries.add(new TermRangeQuery("number", minTerm, maxTerm, true, true));
        }
        if (queries.size() == 200) {
            break;
        }
    }
    DirectoryReader r = DirectoryReader.open(dir);
    IndexSearcher s = new IndexSearcher(r);
    s.setQueryCache(null); // don't bench the cache
    printQueryTerms((MultiTermQuery) queries.get(0), s);
    // Run the query set 10 times and keep the fastest wall-clock iteration;
    // hash accumulates hit counts as a cheap correctness fingerprint.
    long bestMS = Long.MAX_VALUE;
    for (int iter = 0; iter < 10; iter++) {
        long startMS = System.currentTimeMillis();
        long totalHits = 0;
        long hash = 0;
        for (Query query : queries) {
            TopDocs hits = s.search(query, 10);
            totalHits += hits.totalHits;
            hash = hash * 31 + hits.totalHits;
        }
        long ms = System.currentTimeMillis() - startMS;
        System.out.println("iter " + iter + ": " + ms + " msec; totalHits=" + totalHits + " hash=" + hash);
        if (ms < bestMS) {
            System.out.println(" **");
            bestMS = ms;
        }
    }
    /*
    long t0 = System.currentTimeMillis();
    long bytesUsed = 0;
    for(int i=0;i<1000;i++) {
      for(AtomicReaderContext ctx : r.leaves()) {
        bytesUsed += ((SegmentReader) ctx.reader()).ramBytesUsed();
      }
    }
    System.out.println((System.currentTimeMillis() - t0) + " msec for 1000 ramBytesUsed: " + (bytesUsed / 1000));
    */
    r.close();
    dir.close();
}
From source file:perf.IDPerfTest.java
License:Apache License
/**
 * Runs one ID-lookup benchmark: indexes ID_COUNT ids produced by {@code ids}
 * using the given block-tree term-block sizes, samples ID_SEARCH_COUNT of
 * them for lookup, then measures primary-key seekExact lookups over five
 * iterations, keeping the best time.
 *
 * @param indexPath directory to build the throwaway index in
 * @param desc human-readable label for this run
 * @param ids source of id values (mutates the shared idValue in place)
 * @param minTermsInBlock minimum terms per block-tree block
 * @param maxTermsInBlock maximum terms per block-tree block
 * @return timings, index sizes and lookup statistics for this configuration
 */
private static Result testOne(String indexPath, String desc, IDIterator ids, final int minTermsInBlock,
        final int maxTermsInBlock) throws IOException {
    System.out.println("\ntest: " + desc + " termBlocks=" + minTermsInBlock + "/" + maxTermsInBlock);
    Directory dir = FSDirectory.open(new File(indexPath));
    //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48));
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_8, new StandardAnalyzer(Version.LUCENE_4_8));
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // So I can walk the files and get the *.tip sizes:
    iwc.setUseCompoundFile(false);
    iwc.setCodec(new Lucene53Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return new Lucene50PostingsFormat(minTermsInBlock, maxTermsInBlock);
        }
    });
    /// 7/7/7 segment structure:
    iwc.setMaxBufferedDocs(ID_COUNT / 777);
    // -1 disables the RAM-based flush trigger so maxBufferedDocs alone
    // controls flushing (deterministic segment structure).
    iwc.setRAMBufferSizeMB(-1);
    //iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    //iwc.setMergePolicy(new LogDocMergePolicy());
    ((TieredMergePolicy) iwc.getMergePolicy()).setFloorSegmentMB(.001);
    ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(0.0);
    //((LogDocMergePolicy) iwc.getMergePolicy()).setMinMergeDocs(1000);
    iwc.getMergePolicy().setNoCFSRatio(0.0);
    IndexWriter w = new IndexWriter(dir, iwc);
    Document doc = new Document();
    FieldType ft = new FieldType(StringField.TYPE_NOT_STORED);
    ft.setTokenized(true);
    ft.freeze();
    // idValue is shared: ids.next() rewrites it in place and the
    // BinaryTokenStream re-reads it for every addDocument call.
    BytesRef idValue = new BytesRef(64);
    Field idField = new Field("id", new BinaryTokenStream(idValue), ft);
    doc.add(idField);
    long t0 = System.nanoTime();
    BytesRef[] lookupIDs = new BytesRef[ID_SEARCH_COUNT];
    Random random = new Random(17);
    int lookupCount = 0;
    // Sample slightly more than proportionally so the array reliably fills.
    double rate = 1.01 * ((double) ID_SEARCH_COUNT) / ID_COUNT;
    for (int i = 0; i < ID_COUNT; i++) {
        ids.next(idValue);
        if (lookupCount < lookupIDs.length && random.nextDouble() <= rate) {
            lookupIDs[lookupCount++] = BytesRef.deepCopyOf(idValue);
        }
        // Trickery: the idsIter changed the idValue which the BinaryTokenStream reuses for each added doc
        w.addDocument(doc);
    }
    if (lookupCount < lookupIDs.length) {
        throw new RuntimeException("didn't get enough lookup ids: " + lookupCount + " vs " + lookupIDs.length);
    }
    long indexTime = System.nanoTime() - t0;
    System.out.println(" indexing done; waitForMerges...");
    w.waitForMerges();
    IndexReader r = DirectoryReader.open(w, true);
    System.out.println(" reader=" + r);
    shuffle(random, lookupIDs);
    shuffle(random, lookupIDs);
    long bestTime = Long.MAX_VALUE;
    long checksum = 0;
    List<AtomicReaderContext> leaves = new ArrayList<>(r.leaves());
    // Sort largest to smallest:
    Collections.sort(leaves, new Comparator<AtomicReaderContext>() {
        @Override
        public int compare(AtomicReaderContext c1, AtomicReaderContext c2) {
            return c2.reader().maxDoc() - c1.reader().maxDoc();
        }
    });
    // One TermsEnum/DocsEnum per segment, reused across all lookups.
    TermsEnum[] termsEnums = new TermsEnum[leaves.size()];
    DocsEnum[] docsEnums = new DocsEnum[leaves.size()];
    int[] docBases = new int[leaves.size()];
    for (int i = 0; i < leaves.size(); i++) {
        //System.out.println("i=" + i + " count=" + leaves.get(i).reader().maxDoc());
        termsEnums[i] = leaves.get(i).reader().fields().terms("id").iterator(null);
        docBases[i] = leaves.get(i).docBase;
    }
    long rawLookupCount = 0;
    int countx = 0;
    // 5 timed iterations over the shuffled lookup set; keep the best.
    for (int iter = 0; iter < 5; iter++) {
        t0 = System.nanoTime();
        BlockTreeTermsReader.seekExactFastNotFound = 0;
        BlockTreeTermsReader.seekExactFastRootNotFound = 0;
        rawLookupCount = 0;
        for (BytesRef id : lookupIDs) {
            if (countx++ < 50) {
                System.out.println(" id=" + id);
            }
            boolean found = false;
            for (int seg = 0; seg < termsEnums.length; seg++) {
                rawLookupCount++;
                if (termsEnums[seg].seekExact(id)) {
                    docsEnums[seg] = termsEnums[seg].docs(null, docsEnums[seg], 0);
                    int docID = docsEnums[seg].nextDoc();
                    if (docID == DocsEnum.NO_MORE_DOCS) {
                        // uh-oh!
                        throw new RuntimeException("id not found: " + id);
                    }
                    // paranoia:
                    checksum += docID + docBases[seg];
                    found = true;
                    // Optimization vs MultiFields: we don't need to check any more segments since id is PK
                    break;
                }
            }
            if (found == false) {
                // uh-oh!
                throw new RuntimeException("id not found: " + id);
            }
        }
        long lookupTime = System.nanoTime() - t0;
        System.out.println(String.format(Locale.ROOT, " iter=" + iter + " lookupTime=%.3f sec",
                lookupTime / 1000000000.0));
        if (lookupTime < bestTime) {
            bestTime = lookupTime;
            System.out.println(" **");
        }
    }
    // Tally on-disk sizes; *.tip files hold the terms index.
    long totalBytes = 0;
    long termsIndexTotalBytes = 0;
    for (String fileName : dir.listAll()) {
        long bytes = dir.fileLength(fileName);
        totalBytes += bytes;
        if (fileName.endsWith(".tip")) {
            termsIndexTotalBytes += bytes;
        }
    }
    r.close();
    // rollback() discards the index — it was only needed for the measurement.
    w.rollback();
    dir.close();
    return new Result(desc, ID_COUNT / (indexTime / 1000000.0), lookupIDs.length / (bestTime / 1000000.0),
            totalBytes, termsIndexTotalBytes, checksum, BlockTreeTermsReader.seekExactFastNotFound,
            BlockTreeTermsReader.seekExactFastRootNotFound, rawLookupCount, minTermsInBlock, maxTermsInBlock);
}
From source file:perf.Index1BNumbers.java
License:Apache License
public static void main(String[] args) throws Exception { File indexPath = new File(args[0]); int numThreads = Integer.parseInt(args[1]); int precStep = Integer.parseInt(args[2]); if (indexPath.exists()) { throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running"); }// w w w . j a v a 2 s .com Directory dir = FSDirectory.open(indexPath); IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer()); iwc.setRAMBufferSizeMB(512); final IndexWriter w = new IndexWriter(dir, iwc); final Field.Store store = Field.Store.NO; final FieldType longFieldType = new FieldType( store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED); longFieldType.setNumericPrecisionStep(precStep); longFieldType.freeze(); final MakeNumbers numbers = new MakeNumbers(); final long startMS = System.currentTimeMillis(); final AtomicInteger docsIndexed = new AtomicInteger(); Thread[] threads = new Thread[numThreads]; for (int i = 0; i < numThreads; i++) { threads[i] = new Thread() { @Override public void run() { Document doc = new Document(); Field field = new LongField("number", 0L, longFieldType); doc.add(field); while (true) { try { long v = numbers.next(); if (v >= numbers.end) { break; } field.setLongValue(v); w.addDocument(doc); int count = docsIndexed.incrementAndGet(); if (count % 200000 == 0) { long ms = System.currentTimeMillis(); System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec; " + v + " vs " + numbers.end + " (" + (numbers.end - v) + " left)"); } } catch (Exception e) { throw new RuntimeException(e); } } } }; threads[i].start(); } for (int i = 0; i < numThreads; i++) { threads[i].join(); } long ms = System.currentTimeMillis(); System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec"); //System.out.println("tot conflicts: " + BytesRefHash.totConflict); w.close(); dir.close(); }
From source file:perf.IndexAndSearchOpenStreetMaps.java
License:Apache License
/**
 * Builds the OpenStreetMaps lat/lon benchmark index(es): reads
 * comma-separated "id,lat,lon" lines from the data file and indexes them
 * into NUM_PARTS separate indexes, using either a fast multi-threaded
 * configuration or a slow deterministic single-threaded one.
 *
 * @param fast true: 4 indexing threads + big RAM buffer; false: 1 thread
 *             with fixed maxBufferedDocs and serial merges (reproducible)
 * @param doForceMerge force-merge each part down to one segment at the end
 * @param doDistanceSort also add doc values so results can be distance-sorted
 */
private static void createIndex(boolean fast, boolean doForceMerge, boolean doDistanceSort)
        throws IOException, InterruptedException {
    // Strict decoding: malformed input fails loudly instead of being replaced.
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is;
    if (SMALL) {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.subsetPlusAllLondon.txt"));
    } else {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.txt"));
    }
    // NOTE(review): this reader is shared by all indexing threads (reads are
    // serialized under `lock` below) and is never closed — confirm that is
    // acceptable for this benchmark.
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
    int NUM_THREADS;
    if (fast) {
        NUM_THREADS = 4;
    } else {
        NUM_THREADS = 1;
    }
    int CHUNK = 10000;
    long t0 = System.nanoTime();
    AtomicLong totalCount = new AtomicLong();
    for (int part = 0; part < NUM_PARTS; part++) {
        Directory dir = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        // null analyzer: only point/doc-values fields are added, no text.
        IndexWriterConfig iwc = new IndexWriterConfig(null);
        iwc.setCodec(getCodec(fast));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        if (fast) {
            ((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(Double.POSITIVE_INFINITY);
            iwc.setRAMBufferSizeMB(1024);
        } else {
            // Deterministic flushing/merging for reproducible slow runs.
            iwc.setMaxBufferedDocs(109630);
            iwc.setMergePolicy(new LogDocMergePolicy());
            iwc.setMergeScheduler(new SerialMergeScheduler());
        }
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        IndexWriter w = new IndexWriter(dir, iwc);
        Thread[] threads = new Thread[NUM_THREADS];
        AtomicBoolean finished = new AtomicBoolean();
        Object lock = new Object();
        final int finalPart = part;
        for (int t = 0; t < NUM_THREADS; t++) {
            threads[t] = new Thread() {
                @Override
                public void run() {
                    String[] lines = new String[CHUNK];
                    int chunkCount = 0;
                    while (finished.get() == false) {
                        try {
                            int count = CHUNK;
                            // Pull a whole chunk of lines under the lock so
                            // threads don't interleave reads; parsing and
                            // indexing then proceed without the lock.
                            synchronized (lock) {
                                for (int i = 0; i < CHUNK; i++) {
                                    String line = reader.readLine();
                                    if (line == null) {
                                        count = i;
                                        finished.set(true);
                                        break;
                                    }
                                    lines[i] = line;
                                }
                                // Cap part 0 at ~2B docs.
                                if (finalPart == 0 && totalCount.get() + count >= 2000000000) {
                                    finished.set(true);
                                }
                            }
                            for (int i = 0; i < count; i++) {
                                String[] parts = lines[i].split(",");
                                //long id = Long.parseLong(parts[0]);
                                double lat = Double.parseDouble(parts[1]);
                                double lon = Double.parseDouble(parts[2]);
                                Document doc = new Document();
                                if (useGeoPoint) {
                                    doc.add(new GeoPointField("point", lat, lon, Field.Store.NO));
                                } else if (useGeo3D || useGeo3DLarge) {
                                    doc.add(new Geo3DPoint("point", lat, lon));
                                } else {
                                    doc.add(new LatLonPoint("point", lat, lon));
                                    if (doDistanceSort) {
                                        doc.add(new LatLonDocValuesField("point", lat, lon));
                                    }
                                }
                                w.addDocument(doc);
                                long x = totalCount.incrementAndGet();
                                if (x % 1000000 == 0) {
                                    System.out.println(x + "...");
                                }
                            }
                            chunkCount++;
                            // Dead code (guarded by `false`): early-break hook for debugging.
                            if (false && SMALL == false && chunkCount == 20000) {
                                System.out.println("NOW BREAK EARLY");
                                break;
                            }
                        } catch (IOException ioe) {
                            throw new RuntimeException(ioe);
                        }
                    }
                }
            };
            threads[t].start();
        }
        for (Thread thread : threads) {
            thread.join();
        }
        System.out.println("Part " + part + " is done: w.maxDoc()=" + w.maxDoc());
        w.commit();
        System.out.println("done commit");
        long t1 = System.nanoTime();
        System.out.println(((t1 - t0) / 1000000000.0) + " sec to index part " + part);
        if (doForceMerge) {
            w.forceMerge(1);
            long t2 = System.nanoTime();
            System.out.println(((t2 - t1) / 1000000000.0) + " sec to force merge part " + part);
        }
        w.close();
    }
    //System.out.println(totalCount.get() + " total docs");
    //System.out.println("Force merge...");
    //w.forceMerge(1);
    //long t2 = System.nanoTime();
    //System.out.println(((t2-t1)/1000000000.0) + " sec to force merge");
    //w.close();
    //long t3 = System.nanoTime();
    //System.out.println(((t3-t2)/1000000000.0) + " sec to close");
    //System.out.println(((t3-t2)/1000000000.0) + " sec to close");
}
From source file:perf.Indexer.java
License:Apache License
private static void _main(String[] clArgs) throws Exception { Args args = new Args(clArgs); // EG: -facets Date -facets characterCount ... FacetsConfig facetsConfig = new FacetsConfig(); facetsConfig.setHierarchical("Date", true); final Set<String> facetFields = new HashSet<String>(); if (args.hasArg("-facets")) { for (String arg : args.getStrings("-facets")) { facetFields.add(arg);/*www . ja v a2 s . c om*/ } } final String dirImpl = args.getString("-dirImpl"); final String dirPath = args.getString("-indexPath") + "/index"; final Directory dir; OpenDirectory od = OpenDirectory.get(dirImpl); dir = od.open(Paths.get(dirPath)); final String analyzer = args.getString("-analyzer"); final Analyzer a; if (analyzer.equals("EnglishAnalyzer")) { a = new EnglishAnalyzer(); } else if (analyzer.equals("StandardAnalyzer")) { a = new StandardAnalyzer(); } else if (analyzer.equals("StandardAnalyzerNoStopWords")) { a = new StandardAnalyzer(CharArraySet.EMPTY_SET); } else if (analyzer.equals("ShingleStandardAnalyzer")) { a = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2, 2); } else if (analyzer.equals("ShingleStandardAnalyzerNoStopWords")) { a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2); } else { throw new RuntimeException("unknown analyzer " + analyzer); } final String lineFile = args.getString("-lineDocsFile"); // -1 means all docs in the line file: final int docCountLimit = args.getInt("-docCountLimit"); final int numThreads = args.getInt("-threadCount"); final boolean doForceMerge = args.getFlag("-forceMerge"); final boolean verbose = args.getFlag("-verbose"); String indexSortField = null; SortField.Type indexSortType = null; if (args.hasArg("-indexSort")) { indexSortField = args.getString("-indexSort"); int i = indexSortField.indexOf(':'); if (i == -1) { throw new IllegalArgumentException( "-indexSort should have form field:type; got: " + indexSortField); } String typeString = indexSortField.substring(i + 1, indexSortField.length()); 
if (typeString.equals("long")) { indexSortType = SortField.Type.LONG; } else if (typeString.equals("string")) { indexSortType = SortField.Type.STRING; } else { throw new IllegalArgumentException("-indexSort can only handle 'long' sort; got: " + typeString); } indexSortField = indexSortField.substring(0, i); } else { indexSortType = null; } final double ramBufferSizeMB = args.getDouble("-ramBufferMB"); final int maxBufferedDocs = args.getInt("-maxBufferedDocs"); final String defaultPostingsFormat = args.getString("-postingsFormat"); final boolean doDeletions = args.getFlag("-deletions"); final boolean printDPS = args.getFlag("-printDPS"); final boolean waitForMerges = args.getFlag("-waitForMerges"); final boolean waitForCommit = args.getFlag("-waitForCommit"); final String mergePolicy = args.getString("-mergePolicy"); final Mode mode; final boolean doUpdate = args.getFlag("-update"); if (doUpdate) { mode = Mode.UPDATE; } else { mode = Mode.valueOf(args.getString("-mode", "add").toUpperCase(Locale.ROOT)); } int randomDocIDMax; if (mode == Mode.UPDATE) { randomDocIDMax = args.getInt("-randomDocIDMax"); } else { randomDocIDMax = -1; } final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat"); final boolean addGroupingFields = args.getFlag("-grouping"); final boolean useCFS = args.getFlag("-cfs"); final boolean storeBody = args.getFlag("-store"); final boolean tvsBody = args.getFlag("-tvs"); final boolean bodyPostingsOffsets = args.getFlag("-bodyPostingsOffsets"); final int maxConcurrentMerges = args.getInt("-maxConcurrentMerges"); final boolean addDVFields = args.getFlag("-dvfields"); final boolean doRandomCommit = args.getFlag("-randomCommit"); final boolean useCMS = args.getFlag("-useCMS"); final boolean disableIOThrottle = args.getFlag("-disableIOThrottle"); if (waitForCommit == false && waitForMerges) { throw new RuntimeException("pass -waitForCommit if you pass -waitForMerges"); } if (waitForCommit == false && doForceMerge) { throw new 
RuntimeException("pass -waitForCommit if you pass -forceMerge"); } if (waitForCommit == false && doDeletions) { throw new RuntimeException("pass -waitForCommit if you pass -deletions"); } if (useCMS == false && disableIOThrottle) { throw new RuntimeException("-disableIOThrottle only makes sense with -useCMS"); } final double nrtEverySec; if (args.hasArg("-nrtEverySec")) { nrtEverySec = args.getDouble("-nrtEverySec"); } else { nrtEverySec = -1.0; } // True to start back at the beginning if we run out of // docs from the line file source: final boolean repeatDocs = args.getFlag("-repeatDocs"); final String facetDVFormatName; if (facetFields.isEmpty()) { facetDVFormatName = "Lucene54"; } else { facetDVFormatName = args.getString("-facetDVFormat"); } if (addGroupingFields && docCountLimit == -1) { a.close(); throw new RuntimeException("cannot add grouping fields unless docCount is set"); } args.check(); System.out.println("Dir: " + dirImpl); System.out.println("Index path: " + dirPath); System.out.println("Analyzer: " + analyzer); System.out.println("Line file: " + lineFile); System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit)); System.out.println("Threads: " + numThreads); System.out.println("Force merge: " + (doForceMerge ? "yes" : "no")); System.out.println("Verbose: " + (verbose ? "yes" : "no")); System.out.println("RAM Buffer MB: " + ramBufferSizeMB); System.out.println("Max buffered docs: " + maxBufferedDocs); System.out.println("Default postings format: " + defaultPostingsFormat); System.out.println("Do deletions: " + (doDeletions ? "yes" : "no")); System.out.println("Wait for merges: " + (waitForMerges ? "yes" : "no")); System.out.println("Wait for commit: " + (waitForCommit ? "yes" : "no")); System.out.println("IO throttle: " + (disableIOThrottle ? 
"no" : "yes")); System.out.println("Merge policy: " + mergePolicy); System.out.println("Mode: " + mode); if (mode == Mode.UPDATE) { System.out.println("DocIDMax: " + randomDocIDMax); } System.out.println("ID field postings format: " + idFieldPostingsFormat); System.out.println("Add grouping fields: " + (addGroupingFields ? "yes" : "no")); System.out.println("Compound file format: " + (useCFS ? "yes" : "no")); System.out.println("Store body field: " + (storeBody ? "yes" : "no")); System.out.println("Term vectors for body field: " + (tvsBody ? "yes" : "no")); System.out.println("Facet DV Format: " + facetDVFormatName); System.out.println("Facet fields: " + facetFields); System.out.println("Body postings offsets: " + (bodyPostingsOffsets ? "yes" : "no")); System.out.println("Max concurrent merges: " + maxConcurrentMerges); System.out.println("Add DocValues fields: " + addDVFields); System.out.println("Use ConcurrentMergeScheduler: " + useCMS); if (nrtEverySec > 0.0) { System.out.println("Open & close NRT reader every: " + nrtEverySec + " sec"); } else { System.out.println("Open & close NRT reader every: never"); } System.out.println("Repeat docs: " + repeatDocs); if (verbose) { InfoStream.setDefault(new PrintStreamInfoStream(System.out)); } final IndexWriterConfig iwc = new IndexWriterConfig(a); if (indexSortField != null) { iwc.setIndexSort(new Sort(new SortField(indexSortField, indexSortType))); } if (mode == Mode.UPDATE) { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } else { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } iwc.setMaxBufferedDocs(maxBufferedDocs); iwc.setRAMBufferSizeMB(ramBufferSizeMB); // So flushed segments do/don't use CFS: iwc.setUseCompoundFile(useCFS); final AtomicBoolean indexingFailed = new AtomicBoolean(); iwc.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle)); iwc.setMergePolicy(getMergePolicy(mergePolicy, useCFS)); // Keep all commit points: if (doDeletions || 
doForceMerge) { iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); } final Codec codec = new Lucene62Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat); } private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName(facetDVFormatName); //private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42"); //private final DocValuesFormat diskDVFormat = DocValuesFormat.forName("Disk"); // private final DocValuesFormat lucene45DVFormat = DocValuesFormat.forName("Lucene45"); private final DocValuesFormat directDVFormat = DocValuesFormat.forName("Direct"); @Override public DocValuesFormat getDocValuesFormatForField(String field) { if (facetFields.contains(field) || field.equals("$facets")) { return facetsDVFormat; //} else if (field.equals("$facets_sorted_doc_values")) { //return diskDVFormat; } else { // Use default DVFormat for all else: // System.out.println("DV: field=" + field + " format=" + super.getDocValuesFormatForField(field)); return super.getDocValuesFormatForField(field); } } }; iwc.setCodec(codec); System.out.println("IW config=" + iwc); IndexWriter w = new IndexWriter(dir, iwc); System.out.println("Index has " + w.maxDoc() + " docs"); final TaxonomyWriter taxoWriter; if (facetFields.isEmpty() == false) { taxoWriter = new DirectoryTaxonomyWriter(od.open(Paths.get(args.getString("-indexPath"), "facets")), IndexWriterConfig.OpenMode.CREATE); } else { taxoWriter = null; } // Fixed seed so group field values are always consistent: final Random random = new Random(17); LineFileDocs lineFileDocs = new LineFileDocs(lineFile, repeatDocs, storeBody, tvsBody, bodyPostingsOffsets, false, taxoWriter, facetFields, facetsConfig, addDVFields); float docsPerSecPerThread = -1f; //float docsPerSecPerThread = 100f; IndexThreads threads = new IndexThreads(random, w, indexingFailed, lineFileDocs, numThreads, 
docCountLimit, addGroupingFields, printDPS, mode, docsPerSecPerThread, null, nrtEverySec, randomDocIDMax); System.out.println("\nIndexer: start"); final long t0 = System.currentTimeMillis(); threads.start(); while (!threads.done() && indexingFailed.get() == false) { Thread.sleep(100); // Commits once per minute on average: if (doRandomCommit && random.nextInt(600) == 17) { System.out.println("Indexer: now commit"); long commitStartNS = System.nanoTime(); w.commit(); System.out.println(String.format(Locale.ROOT, "Indexer: commit took %.1f msec", (System.nanoTime() - commitStartNS) / 1000000.)); } } threads.stop(); final long t1 = System.currentTimeMillis(); System.out.println("\nIndexer: indexing done (" + (t1 - t0) + " msec); total " + w.maxDoc() + " docs"); // if we update we can not tell how many docs if (threads.failed.get()) { throw new RuntimeException("exceptions during indexing"); } if (mode != Mode.UPDATE && docCountLimit != -1 && w.maxDoc() != docCountLimit) { throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit); } final Map<String, String> commitData = new HashMap<String, String>(); if (waitForMerges) { w.close(); IndexWriterConfig iwc2 = new IndexWriterConfig(a); iwc2.setMergeScheduler( getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle)); iwc2.setMergePolicy(getMergePolicy(mergePolicy, useCFS)); iwc2.setCodec(codec); iwc2.setUseCompoundFile(useCFS); iwc2.setMaxBufferedDocs(maxBufferedDocs); iwc2.setRAMBufferSizeMB(ramBufferSizeMB); if (indexSortField != null) { iwc2.setIndexSort(new Sort(new SortField(indexSortField, indexSortType))); } w = new IndexWriter(dir, iwc2); long t2 = System.currentTimeMillis(); System.out.println("\nIndexer: waitForMerges done (" + (t2 - t1) + " msec)"); } if (waitForCommit) { commitData.put("userData", "multi"); w.setLiveCommitData(commitData.entrySet()); long t2 = System.currentTimeMillis(); w.commit(); long t3 = System.currentTimeMillis(); 
System.out.println("\nIndexer: commit multi (took " + (t3 - t2) + " msec)"); } else { w.rollback(); w = null; } if (doForceMerge) { long forceMergeStartMSec = System.currentTimeMillis(); w.forceMerge(1); long forceMergeEndMSec = System.currentTimeMillis(); System.out.println( "\nIndexer: force merge done (took " + (forceMergeEndMSec - forceMergeStartMSec) + " msec)"); commitData.put("userData", "single"); w.setLiveCommitData(commitData.entrySet()); w.commit(); final long t5 = System.currentTimeMillis(); System.out.println("\nIndexer: commit single done (took " + (t5 - forceMergeEndMSec) + " msec)"); } if (doDeletions) { final long t5 = System.currentTimeMillis(); // Randomly delete 5% of the docs final Set<Integer> deleted = new HashSet<Integer>(); final int maxDoc = w.maxDoc(); final int toDeleteCount = (int) (maxDoc * 0.05); System.out.println("\nIndexer: delete " + toDeleteCount + " docs"); while (deleted.size() < toDeleteCount) { final int id = random.nextInt(maxDoc); if (!deleted.contains(id)) { deleted.add(id); w.deleteDocuments(new Term("id", LineFileDocs.intToID(id))); } } final long t6 = System.currentTimeMillis(); System.out.println("\nIndexer: deletes done (took " + (t6 - t5) + " msec)"); commitData.put("userData", doForceMerge ? 
"delsingle" : "delmulti"); w.setLiveCommitData(commitData.entrySet()); w.commit(); final long t7 = System.currentTimeMillis(); System.out.println("\nIndexer: commit delmulti done (took " + (t7 - t6) + " msec)"); if (doUpdate || w.numDocs() != maxDoc - toDeleteCount) { throw new RuntimeException( "count mismatch: w.numDocs()=" + w.numDocs() + " but expected " + (maxDoc - toDeleteCount)); } } if (taxoWriter != null) { System.out.println("Taxonomy has " + taxoWriter.getSize() + " ords"); taxoWriter.commit(); taxoWriter.close(); } final long tCloseStart = System.currentTimeMillis(); if (w != null) { w.close(); w = null; } if (waitForCommit) { System.out.println("\nIndexer: at close: " + SegmentInfos.readLatestCommit(dir)); System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) + " msec"); } dir.close(); final long tFinal = System.currentTimeMillis(); System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed()); final long indexingTime; if (waitForCommit) { indexingTime = tFinal - t0; System.out.println("\nIndexer: finished (" + indexingTime + " msec)"); } else { indexingTime = t1 - t0; System.out.println("\nIndexer: finished (" + indexingTime + " msec), excluding commit"); } System.out.println( "\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / (indexingTime / 3600000.)) + " GB/hour plain text"); }