List of usage examples for org.apache.lucene.index IndexWriterConfig setRAMBufferSizeMB
@Override public IndexWriterConfig setRAMBufferSizeMB(double ramBufferSizeMB)
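Before the project examples below, a minimal sketch of the basic call pattern, assuming a recent Lucene API where IndexWriterConfig takes only an Analyzer; the index path and buffer size here are illustrative, not taken from any example:

    // Minimal sketch: flush a new segment whenever buffered documents
    // use roughly 256 MB of heap (the Lucene default is 16 MB).
    Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // hypothetical path
    IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
    iwc.setRAMBufferSizeMB(256.0);
    try (IndexWriter writer = new IndexWriter(dir, iwc)) {
      Document doc = new Document();
      doc.add(new TextField("body", "hello world", Field.Store.NO));
      writer.addDocument(doc);
    }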
From source file:perf.IndexGeoNames.java
License:Apache License
public static void main(String[] args) throws Exception {
  String geoNamesFile = args[0];
  File indexPath = new File(args[1]);
  int numThreads = Integer.parseInt(args[2]);
  int precStep = Integer.parseInt(args[3]);
  if (indexPath.exists()) {
    throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running");
  }
  Directory dir = FSDirectory.open(indexPath);
  //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48));
  IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
  //iwc.setRAMBufferSizeMB(350);
  iwc.setInfoStream(new PrintStreamInfoStream(System.out));
  if (normal == false) {
    iwc.setRAMBufferSizeMB(1024);
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
    //iwc.setMergePolicy(NoMergePolicy.NO_COMPOUND_FILES);
  } else {
    // 5/5 segments:
    iwc.setMaxBufferedDocs(157234);
    iwc.setRAMBufferSizeMB(-1);
  }
  //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(3, 1);
  final IndexWriter w = new IndexWriter(dir, iwc);

  final Field.Store store = Field.Store.NO;
  final FieldType doubleFieldType = new FieldType(
      store == Field.Store.NO ? DoubleField.TYPE_NOT_STORED : DoubleField.TYPE_STORED);
  doubleFieldType.setNumericPrecisionStep(precStep);
  doubleFieldType.freeze();
  final FieldType longFieldType = new FieldType(
      store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED);
  longFieldType.setNumericPrecisionStep(precStep);
  longFieldType.freeze();
  final FieldType intFieldType = new FieldType(
      store == Field.Store.NO ? IntField.TYPE_NOT_STORED : IntField.TYPE_STORED);
  intFieldType.setNumericPrecisionStep(precStep);
  intFieldType.freeze();

  // 64K buffer:
  InputStream is = new FileInputStream(geoNamesFile);
  final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
  final AtomicInteger docsIndexed = new AtomicInteger();
  final long startMS = System.currentTimeMillis();

  Thread[] threads = new Thread[numThreads];
  // With reuse it's ~ 38% faster (41.8 sec vs 67.0 sec):
  final boolean reuseDocAndFields = false;
  for (int i = 0; i < numThreads; i++) {
    threads[i] = new Thread() {
      @Override
      public void run() {
        ParsePosition datePos = new ParsePosition(0);
        SimpleDateFormat dateParser = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
        if (reuseDocAndFields) {
          Document doc = new Document();
          IntField geoNameID = new IntField("geoNameID", 0, intFieldType);
          doc.add(geoNameID);
          TextField nameField = new TextField("name", "", store);
          doc.add(nameField);
          TextField asciiNameField = new TextField("asciiName", "", store);
          doc.add(asciiNameField);
          TextField alternateNameField = new TextField("alternateNames", "", store);
          doc.add(alternateNameField);
          StringField featureClassField = new StringField("featureClass", "", store);
          doc.add(featureClassField);
          StringField featureCodeField = new StringField("featureCode", "", store);
          doc.add(featureCodeField);
          StringField countryCodeField = new StringField("countryCode", "", store);
          doc.add(countryCodeField);
          StringField cc2Field = new StringField("cc2", "", store);
          doc.add(cc2Field);
          StringField admin1Field = new StringField("admin1", "", store);
          doc.add(admin1Field);
          StringField admin2Field = new StringField("admin2", "", store);
          doc.add(admin2Field);
          StringField admin3Field = new StringField("admin3", "", store);
          doc.add(admin3Field);
          StringField admin4Field = new StringField("admin4", "", store);
          doc.add(admin4Field);
          StringField tzField = new StringField("timezone", "", store);
          doc.add(tzField);
          while (true) {
            try {
              // Curiously BufferedReader.readLine seems to be thread-safe...
              String line = reader.readLine();
              if (line == null) {
                break;
              }
              String[] values = line.split("\t");
              geoNameID.setIntValue(Integer.parseInt(values[0]));
              nameField.setStringValue(values[1]);
              asciiNameField.setStringValue(values[2]);
              alternateNameField.setStringValue(values[3]);
              /*
              if (values[4].isEmpty() == false) {
                double v = Double.parseDouble(values[4]);
                doc.add(new DoubleField("latitude", v, doubleFieldType));
                doc.add(new DoubleDocValuesField("latitude", v));
              }
              if (values[5].isEmpty() == false) {
                double v = Double.parseDouble(values[5]);
                doc.add(new DoubleField("longitude", v, doubleFieldType));
                doc.add(new DoubleDocValuesField("longitude", v));
              }
              */
              featureClassField.setStringValue(values[6]);
              featureCodeField.setStringValue(values[7]);
              countryCodeField.setStringValue(values[8]);
              cc2Field.setStringValue(values[9]);
              admin1Field.setStringValue(values[10]);
              admin2Field.setStringValue(values[11]);
              admin3Field.setStringValue(values[12]);
              admin4Field.setStringValue(values[13]);
              /*
              if (values[14].isEmpty() == false) {
                long v = Long.parseLong(values[14]);
                doc.add(new LongField("population", v, longFieldType));
                doc.add(new NumericDocValuesField("population", v));
              }
              if (values[15].isEmpty() == false) {
                long v = Long.parseLong(values[15]);
                doc.add(new LongField("elevation", v, longFieldType));
                doc.add(new NumericDocValuesField("elevation", v));
              }
              if (values[16].isEmpty() == false) {
                doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType));
              }
              */
              tzField.setStringValue(values[17]);
              /*
              if (values[18].isEmpty() == false) {
                datePos.setIndex(0);
                Date date = dateParser.parse(values[18], datePos);
                doc.add(new LongField("modified", date.getTime(), longFieldType));
              }
              */
              w.addDocument(doc);
              int count = docsIndexed.incrementAndGet();
              if (count % 200000 == 0) {
                long ms = System.currentTimeMillis();
                System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec");
              }
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          }
        } else {
          while (true) {
            try {
              // Curiously BufferedReader.readLine seems to be thread-safe...
              String line = reader.readLine();
              if (line == null) {
                break;
              }
              String[] values = line.split("\t");
              Document doc = new Document();
              doc.add(new IntField("geoNameID", Integer.parseInt(values[0]), intFieldType));
              doc.add(new TextField("name", values[1], store));
              doc.add(new TextField("asciiName", values[2], store));
              doc.add(new TextField("alternateNames", values[3], store));
              if (values[4].isEmpty() == false) {
                double v = Double.parseDouble(values[4]);
                doc.add(new DoubleField("latitude", v, doubleFieldType));
                doc.add(new DoubleDocValuesField("latitude", v));
              }
              if (values[5].isEmpty() == false) {
                double v = Double.parseDouble(values[5]);
                doc.add(new DoubleField("longitude", v, doubleFieldType));
                doc.add(new DoubleDocValuesField("longitude", v));
              }
              doc.add(new StringField("featureClass", values[6], store));
              doc.add(new StringField("featureCode", values[7], store));
              doc.add(new StringField("countryCode", values[8], store));
              doc.add(new StringField("cc2", values[9], store));
              doc.add(new StringField("admin1Code", values[10], store));
              doc.add(new StringField("admin2Code", values[11], store));
              doc.add(new StringField("admin3Code", values[12], store));
              doc.add(new StringField("admin4Code", values[13], store));
              if (values[14].isEmpty() == false) {
                long v = Long.parseLong(values[14]);
                doc.add(new LongField("population", v, longFieldType));
                doc.add(new NumericDocValuesField("population", v));
              }
              if (values[15].isEmpty() == false) {
                long v = Long.parseLong(values[15]);
                doc.add(new LongField("elevation", v, longFieldType));
                doc.add(new NumericDocValuesField("elevation", v));
              }
              if (values[16].isEmpty() == false) {
                doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType));
              }
              doc.add(new StringField("timezone", values[17], store));
              if (values[18].isEmpty() == false) {
                datePos.setIndex(0);
                Date date = dateParser.parse(values[18], datePos);
                doc.add(new LongField("modified", date.getTime(), longFieldType));
              }
              w.addDocument(doc);
              int count = docsIndexed.incrementAndGet();
              if (count % 200000 == 0) {
                long ms = System.currentTimeMillis();
                System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec");
              }
            } catch (Exception e) {
              throw new RuntimeException(e);
            }
          }
        }
      }
    };
    threads[i].start();
  }

  DirectoryReader r = DirectoryReader.open(w, true);
  for (int i = 0; i < 100; i++) {
    DirectoryReader r2 = DirectoryReader.openIfChanged(r);
    if (r2 != null) {
      r.close();
      r = r2;
    }
    Thread.sleep(500);
  }
  if (r != null) {
    r.close();
    r = null;
  }

  for (int i = 0; i < numThreads; i++) {
    threads[i].join();
  }
  long ms = System.currentTimeMillis();
  System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec");
  //System.out.println("tot conflicts: " + BytesRefHash.totConflict);
  //w.shutdown(normal);
  w.close();
  dir.close();
}
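The if/else near the top of this example switches between Lucene's two auto-flush triggers: flush by RAM usage, or flush by document count to get a deterministic segment geometry for benchmarking. A brief sketch of that pattern in isolation, with the values taken from the example above; the -1 sentinel is IndexWriterConfig.DISABLE_AUTO_FLUSH:

    // Flush by RAM: large buffer, doc-count trigger disabled.
    IndexWriterConfig byRam = new IndexWriterConfig(new StandardAnalyzer());
    byRam.setRAMBufferSizeMB(1024);
    byRam.setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH);

    // Flush by doc count: RAM trigger disabled, so every flushed segment
    // holds the same number of documents (predictable segment counts).
    IndexWriterConfig byDocs = new IndexWriterConfig(new StandardAnalyzer());
    byDocs.setMaxBufferedDocs(157234);
    byDocs.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH);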
From source file:perf.IndexNumbers.java
License:Apache License
public static void main(String[] args) throws Exception {
  String numbersFile = args[0];
  File indexPath = new File(args[1]);
  int numThreads = Integer.parseInt(args[2]);
  int precStep = Integer.parseInt(args[3]);
  if (indexPath.exists()) {
    throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running");
  }
  Directory dir = FSDirectory.open(indexPath);
  IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
  //iwc.setRAMBufferSizeMB(350);
  //iwc.setInfoStream(new PrintStreamInfoStream(System.out));
  if (normal == false) {
    iwc.setRAMBufferSizeMB(64);
    iwc.setMergePolicy(NoMergePolicy.INSTANCE);
  } else {
    // 5/5 segments:
    // 3273681 docs in twitter timestamps
    iwc.setMaxBufferedDocs(59522);
    iwc.setRAMBufferSizeMB(-1);
  }
  //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(3, 1);
  final IndexWriter w = new IndexWriter(dir, iwc);

  final Field.Store store = Field.Store.NO;
  final FieldType doubleFieldType = new FieldType(
      store == Field.Store.NO ? DoubleField.TYPE_NOT_STORED : DoubleField.TYPE_STORED);
  doubleFieldType.setNumericPrecisionStep(precStep);
  doubleFieldType.freeze();
  final FieldType longFieldType = new FieldType(
      store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED);
  longFieldType.setNumericPrecisionStep(precStep);
  longFieldType.freeze();
  final FieldType intFieldType = new FieldType(
      store == Field.Store.NO ? IntField.TYPE_NOT_STORED : IntField.TYPE_STORED);
  intFieldType.setNumericPrecisionStep(precStep);
  intFieldType.freeze();

  // 64K buffer:
  InputStream is = new FileInputStream(numbersFile);
  final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
  final AtomicInteger docsIndexed = new AtomicInteger();
  final long startMS = System.currentTimeMillis();

  Thread[] threads = new Thread[numThreads];
  for (int i = 0; i < numThreads; i++) {
    threads[i] = new Thread() {
      @Override
      public void run() {
        ParsePosition datePos = new ParsePosition(0);
        SimpleDateFormat dateParser = new SimpleDateFormat("yyyy-MM-dd", Locale.US);
        Document doc = new Document();
        Field field = new LongField("number", 0L, longFieldType);
        doc.add(field);
        while (true) {
          try {
            // Curiously BufferedReader.readLine seems to be thread-safe...
            String line = reader.readLine();
            if (line == null) {
              break;
            }
            field.setLongValue(Long.parseLong(line.trim()));
            w.addDocument(doc);
            int count = docsIndexed.incrementAndGet();
            if (count % 200000 == 0) {
              long ms = System.currentTimeMillis();
              System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec");
            }
          } catch (Exception e) {
            throw new RuntimeException(e);
          }
        }
      }
    };
    threads[i].start();
  }

  for (int i = 0; i < numThreads; i++) {
    threads[i].join();
  }
  long ms = System.currentTimeMillis();
  System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec");
  //System.out.println("tot conflicts: " + BytesRefHash.totConflict);
  if (normal == false) {
    w.abortMerges();
  }
  w.close();
  dir.close();
}
From source file:perf.NRTPerfTest.java
License:Apache License
public static void main(String[] args) throws Exception {
  final String dirImpl = args[0];
  final String dirPath = args[1];
  final String commit = args[2];
  final String lineDocFile = args[3];
  final long seed = Long.parseLong(args[4]);
  final double docsPerSec = Double.parseDouble(args[5]);
  final double runTimeSec = Double.parseDouble(args[6]);
  final int numSearchThreads = Integer.parseInt(args[7]);
  int numIndexThreads = Integer.parseInt(args[8]);
  if (numIndexThreads > docsPerSec) {
    System.out.println("INFO: numIndexThreads higher than docsPerSec, adjusting numIndexThreads");
    numIndexThreads = (int) Math.max(1, docsPerSec);
  }
  final double reopenPerSec = Double.parseDouble(args[9]);
  final Mode mode = Mode.valueOf(args[10].toUpperCase(Locale.ROOT));
  statsEverySec = Integer.parseInt(args[11]);
  final boolean doCommit = args[12].equals("yes");
  final double mergeMaxWriteMBPerSec = Double.parseDouble(args[13]);
  if (mergeMaxWriteMBPerSec != 0.0) {
    throw new IllegalArgumentException("mergeMaxWriteMBPerSec must be 0.0 until LUCENE-3202 is done");
  }
  final String tasksFile = args[14];
  if (Files.notExists(Paths.get(tasksFile))) {
    throw new FileNotFoundException("tasks file not found " + tasksFile);
  }
  final boolean hasProcMemInfo = Files.exists(Paths.get("/proc/meminfo"));

  System.out.println("DIR=" + dirImpl);
  System.out.println("Index=" + dirPath);
  System.out.println("Commit=" + commit);
  System.out.println("LineDocs=" + lineDocFile);
  System.out.println("Docs/sec=" + docsPerSec);
  System.out.println("Run time sec=" + runTimeSec);
  System.out.println("NumSearchThreads=" + numSearchThreads);
  System.out.println("NumIndexThreads=" + numIndexThreads);
  System.out.println("Reopen/sec=" + reopenPerSec);
  System.out.println("Mode=" + mode);
  System.out.println("tasksFile=" + tasksFile);
  System.out.println("Record stats every " + statsEverySec + " seconds");

  final int count = (int) ((runTimeSec / statsEverySec) + 2);
  docsIndexedByTime = new AtomicInteger[count];
  searchesByTime = new AtomicInteger[count];
  totalUpdateTimeByTime = new AtomicLong[count];
  final AtomicInteger[] reopensByTime = new AtomicInteger[count];
  for (int i = 0; i < count; i++) {
    docsIndexedByTime[i] = new AtomicInteger();
    searchesByTime[i] = new AtomicInteger();
    totalUpdateTimeByTime[i] = new AtomicLong();
    reopensByTime[i] = new AtomicInteger();
  }
  System.out.println("Max merge MB/sec = " + (mergeMaxWriteMBPerSec <= 0.0 ? "unlimited" : mergeMaxWriteMBPerSec));

  final Random random = new Random(seed);
  final LineFileDocs docs = new LineFileDocs(lineDocFile, true, false, false, false, false, null,
      new HashSet<String>(), null, true);

  final Directory dir0;
  if (dirImpl.equals("MMapDirectory")) {
    dir0 = new MMapDirectory(Paths.get(dirPath));
  } else if (dirImpl.equals("NIOFSDirectory")) {
    dir0 = new NIOFSDirectory(Paths.get(dirPath));
  } else if (dirImpl.equals("SimpleFSDirectory")) {
    dir0 = new SimpleFSDirectory(Paths.get(dirPath));
  } else {
    docs.close();
    throw new RuntimeException("unknown directory impl \"" + dirImpl + "\"");
  }

  //final NRTCachingDirectory dir = new NRTCachingDirectory(dir0, 10, 200.0, mergeMaxWriteMBPerSec);
  final NRTCachingDirectory dir = new NRTCachingDirectory(dir0, 20, 400.0);
  //final MergeScheduler ms = dir.getMergeScheduler();
  //final Directory dir = dir0;
  //final MergeScheduler ms = new ConcurrentMergeScheduler();

  final String field = "body";

  // Open an IW on the requested commit point, but, don't
  // delete other (past or future) commit points:

  // TODO take Analyzer as parameter
  StandardAnalyzer analyzer = new StandardAnalyzer(CharArraySet.EMPTY_SET);
  final IndexWriterConfig conf = new IndexWriterConfig(analyzer);
  conf.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
  conf.setRAMBufferSizeMB(256.0);
  //iwc.setMergeScheduler(ms);

  final Codec codec = new Lucene62Codec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {
      if (field.equals("id")) {
        return PostingsFormat.forName("Memory");
      } else {
        return PostingsFormat.forName("Lucene50");
      }
    }

    private final DocValuesFormat direct = DocValuesFormat.forName("Direct");

    @Override
    public DocValuesFormat getDocValuesFormatForField(String field) {
      return direct;
    }
  };
  conf.setCodec(codec);

  /*
  iwc.setMergePolicy(new LogByteSizeMergePolicy());
  ((LogMergePolicy) iwc.getMergePolicy()).setUseCompoundFile(false);
  ((LogMergePolicy) iwc.getMergePolicy()).setMergeFactor(30);
  ((LogByteSizeMergePolicy) iwc.getMergePolicy()).setMaxMergeMB(10000.0);
  System.out.println("USING LOG BS MP");
  */

  TieredMergePolicy tmp = new TieredMergePolicy();
  tmp.setNoCFSRatio(0.0);
  tmp.setMaxMergedSegmentMB(1000000.0);
  //tmp.setReclaimDeletesWeight(3.0);
  //tmp.setMaxMergedSegmentMB(7000.0);
  conf.setMergePolicy(tmp);

  if (!commit.equals("none")) {
    conf.setIndexCommit(PerfUtils.findCommitPoint(commit, dir));
  }

  // Make sure merges run @ higher prio than indexing:
  final ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) conf.getMergeScheduler();
  cms.setMaxMergesAndThreads(4, 1);

  conf.setMergedSegmentWarmer(new MergedReaderWarmer(field));

  final IndexWriter w = new IndexWriter(dir, conf);
  // w.setInfoStream(System.out);

  IndexThreads.UpdatesListener updatesListener = new IndexThreads.UpdatesListener() {
    long startTimeNS;

    @Override
    public void beforeUpdate() {
      startTimeNS = System.nanoTime();
    }

    @Override
    public void afterUpdate() {
      int idx = currentQT.get();
      totalUpdateTimeByTime[idx].addAndGet(System.nanoTime() - startTimeNS);
      docsIndexedByTime[idx].incrementAndGet();
    }
  };
  IndexThreads indexThreads = new IndexThreads(random, w, new AtomicBoolean(false), docs, numIndexThreads, -1,
      false, false, mode, (float) (docsPerSec / numIndexThreads), updatesListener, -1.0, w.maxDoc());

  // NativePosixUtil.mlockTermsDict(startR, "id");
  final SearcherManager manager = new SearcherManager(w, null);
  IndexSearcher s = manager.acquire();
  try {
    System.out.println("Reader=" + s.getIndexReader());
  } finally {
    manager.release(s);
  }

  final DirectSpellChecker spellChecker = new DirectSpellChecker();
  final IndexState indexState = new IndexState(manager, null, field, spellChecker, "PostingsHighlighter", null);
  final QueryParser qp = new QueryParser(field, analyzer);
  TaskParser taskParser = new TaskParser(indexState, qp, field, 10, random, true);
  final TaskSource tasks = new RandomTaskSource(taskParser, tasksFile, random) {
    @Override
    public void taskDone(Task task, long queueTimeNS, int totalHitCount) {
      searchesByTime[currentQT.get()].incrementAndGet();
    }
  };
  System.out.println("Task repeat count 1");
  System.out.println("Tasks file " + tasksFile);
  System.out.println("Num task per cat 20");

  final TaskThreads taskThreads = new TaskThreads(tasks, indexState, numSearchThreads);

  final ReopenThread reopenThread = new ReopenThread(reopenPerSec, manager, reopensByTime, runTimeSec);
  reopenThread.setName("ReopenThread");
  reopenThread.setPriority(4 + Thread.currentThread().getPriority());
  System.out.println("REOPEN PRI " + reopenThread.getPriority());

  indexThreads.start();
  reopenThread.start();
  taskThreads.start();

  Thread.currentThread().setPriority(5 + Thread.currentThread().getPriority());
  System.out.println("TIMER PRI " + Thread.currentThread().getPriority());

  //System.out.println("Start: " + new Date());
  final long startMS = System.currentTimeMillis();
  final long stopMS = startMS + (long) (runTimeSec * 1000);
  int lastQT = -1;
  while (true) {
    final long t = System.currentTimeMillis();
    if (t >= stopMS) {
      break;
    }
    final int qt = (int) ((t - startMS) / statsEverySec / 1000);
    currentQT.set(qt);
    if (qt != lastQT) {
      final int prevQT = lastQT;
      lastQT = qt;
      if (prevQT > 0) {
        final String other;
        if (hasProcMemInfo) {
          other = " D=" + getLinuxDirtyBytes();
        } else {
          other = "";
        }
        int prev = prevQT - 1;
        System.out.println(String.format("QT %d searches=%d docs=%d reopens=%s totUpdateTime=%d", prev,
            searchesByTime[prev].get(), docsIndexedByTime[prev].get(), reopensByTime[prev].get() + other,
            TimeUnit.NANOSECONDS.toMillis(totalUpdateTimeByTime[prev].get())));
      }
    }
    Thread.sleep(25);
  }

  taskThreads.stop();
  reopenThread.join();
  indexThreads.stop();

  System.out.println("By time:");
  for (int i = 0; i < searchesByTime.length - 2; i++) {
    System.out.println(String.format("  %d searches=%d docs=%d reopens=%d totUpdateTime=%d", i * statsEverySec,
        searchesByTime[i].get(), docsIndexedByTime[i].get(), reopensByTime[i].get(),
        TimeUnit.NANOSECONDS.toMillis(totalUpdateTimeByTime[i].get())));
  }

  manager.close();
  if (doCommit) {
    w.close();
  } else {
    w.rollback();
  }
}
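Worth noting in the NRT example above: with frequent reopens, every refresh flushes whatever is buffered, so a large setRAMBufferSizeMB value mostly bounds the worst-case flush size rather than being filled between reopens. A hedged sketch of that interaction, assuming a Lucene version where SearcherManager takes (IndexWriter, SearcherFactory); names are illustrative:

    IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
    conf.setRAMBufferSizeMB(256.0); // upper bound; NRT reopens usually flush earlier
    IndexWriter w = new IndexWriter(dir, conf);
    SearcherManager mgr = new SearcherManager(w, new SearcherFactory());
    // ... index documents from other threads ...
    mgr.maybeRefresh(); // flushes buffered docs into a new segment and opens an NRT reader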
From source file:perf.PKLookupPerfTest3X.java
License:Apache License
private static void createIndex(final Directory dir, final int docCount) throws IOException {
  System.out.println("Create index... " + docCount + " docs");

  final IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, new WhitespaceAnalyzer(Version.LUCENE_35));
  iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

  // 5 segs per level in 3 levels:
  int mbd = docCount / (5 * 111);
  iwc.setMaxBufferedDocs(mbd);
  iwc.setRAMBufferSizeMB(-1.0);
  ((TieredMergePolicy) iwc.getMergePolicy()).setUseCompoundFile(false);

  final IndexWriter w = new IndexWriter(dir, iwc);
  //w.setInfoStream(System.out);

  final Document doc = new Document();
  final Field field = new Field("id", "", Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS);
  field.setIndexOptions(FieldInfo.IndexOptions.DOCS_ONLY);
  doc.add(field);

  for (int i = 0; i < docCount; i++) {
    field.setValue(String.format("%09d", i));
    w.addDocument(doc);
    if ((i + 1) % 1000000 == 0) {
      System.out.println((i + 1) + "...");
    }
  }
  w.waitForMerges();
  w.close();
}
From source file:perf.SearchPerfTest.java
License:Apache License
private static void _main(String[] clArgs) throws Exception {

  // args: dirImpl indexPath numThread numIterPerThread
  // eg java SearchPerfTest /path/to/index 4 100
  final Args args = new Args(clArgs);

  Directory dir0;
  final String dirPath = args.getString("-indexPath") + "/index";
  final String dirImpl = args.getString("-dirImpl");

  OpenDirectory od = OpenDirectory.get(dirImpl);

  /*
  } else if (dirImpl.equals("NativePosixMMapDirectory")) {
    dir0 = new NativePosixMMapDirectory(new File(dirPath));
    ramDir = null;
    if (doFacets) {
      facetsDir = new NativePosixMMapDirectory(new File(facetsDirPath));
    }
  } else if (dirImpl.equals("CachingDirWrapper")) {
    dir0 = new CachingRAMDirectory(new MMapDirectory(new File(dirPath)));
    ramDir = null;
  } else if (dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
    // Load only non-postings files into RAMDir (assumes
    // Lucene40PF is the wrapped PF):
    Set<String> postingsExtensions = new HashSet<String>();
    postingsExtensions.add("frq");
    postingsExtensions.add("prx");
    postingsExtensions.add("tip");
    postingsExtensions.add("tim");
    ramDir = new RAMDirectory();
    Directory fsDir = new MMapDirectory(new File(dirPath));
    for (String file : fsDir.listAll()) {
      int idx = file.indexOf('.');
      if (idx != -1 && postingsExtensions.contains(file.substring(idx + 1, file.length()))) {
        continue;
      }
      fsDir.copy(ramDir, file, file, IOContext.READ);
    }
    dir0 = new FileSwitchDirectory(postingsExtensions, fsDir, ramDir, true);
    if (doFacets) {
      facetsDir = new RAMDirectory(new SimpleFSDirectory(new File(facetsDirPath)), IOContext.READ);
    }
  */

  final RAMDirectory ramDir;
  dir0 = od.open(Paths.get(dirPath));
  if (dir0 instanceof RAMDirectory) {
    ramDir = (RAMDirectory) dir0;
  } else {
    ramDir = null;
  }

  // TODO: NativeUnixDir?

  final String analyzer = args.getString("-analyzer");
  final String tasksFile = args.getString("-taskSource");
  final int searchThreadCount = args.getInt("-searchThreadCount");
  final String fieldName = args.getString("-field");
  final boolean printHeap = args.getFlag("-printHeap");
  final boolean doPKLookup = args.getFlag("-pk");
  final int topN = args.getInt("-topN");
  final boolean doStoredLoads = args.getFlag("-loadStoredFields");

  // Used to choose which random subset of tasks we will
  // run, to generate the PKLookup tasks, and to generate
  // any random pct filters:
  final long staticRandomSeed = args.getLong("-staticSeed");

  // Used to shuffle the random subset of tasks:
  final long randomSeed = args.getLong("-seed");

  // TODO: this could be way better.
  final String similarity = args.getString("-similarity");
  // now reflect
  final Class<? extends Similarity> simClazz =
      Class.forName("org.apache.lucene.search.similarities." + similarity).asSubclass(Similarity.class);
  final Similarity sim = simClazz.newInstance();

  System.out.println("Using dir impl " + dir0.getClass().getName());
  System.out.println("Analyzer " + analyzer);
  System.out.println("Similarity " + similarity);
  System.out.println("Search thread count " + searchThreadCount);
  System.out.println("topN " + topN);
  System.out.println("JVM " + (Constants.JRE_IS_64BIT ? "is" : "is not") + " 64bit");
  System.out.println("Pointer is " + RamUsageEstimator.NUM_BYTES_OBJECT_REF + " bytes");

  final Analyzer a;
  if (analyzer.equals("EnglishAnalyzer")) {
    a = new EnglishAnalyzer();
  } else if (analyzer.equals("ClassicAnalyzer")) {
    a = new ClassicAnalyzer();
  } else if (analyzer.equals("StandardAnalyzer")) {
    a = new StandardAnalyzer();
  } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
    a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
  } else if (analyzer.equals("ShingleStandardAnalyzer")) {
    a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2,
        ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, true, ShingleFilter.DEFAULT_FILLER_TOKEN);
  } else {
    throw new RuntimeException("unknown analyzer " + analyzer);
  }

  final ReferenceManager<IndexSearcher> mgr;
  final IndexWriter writer;
  final Directory dir;

  final String commit = args.getString("-commit");
  final String hiliteImpl = args.getString("-hiliteImpl");
  final String logFile = args.getString("-log");

  final long tSearcherStart = System.currentTimeMillis();

  final boolean verifyCheckSum = !args.getFlag("-skipVerifyChecksum");
  final boolean recacheFilterDeletes = args.getFlag("-recacheFilterDeletes");
  if (recacheFilterDeletes) {
    throw new UnsupportedOperationException("recacheFilterDeletes was deprecated");
  }

  if (args.getFlag("-nrt")) {
    // TODO: get taxoReader working here too
    // TODO: factor out & share this CL processing w/ Indexer
    final int indexThreadCount = args.getInt("-indexThreadCount");
    final String lineDocsFile = args.getString("-lineDocsFile");
    final float docsPerSecPerThread = args.getFloat("-docsPerSecPerThread");
    final float reopenEverySec = args.getFloat("-reopenEverySec");
    final boolean storeBody = args.getFlag("-store");
    final boolean tvsBody = args.getFlag("-tvs");
    final boolean useCFS = args.getFlag("-cfs");
    final String defaultPostingsFormat = args.getString("-postingsFormat");
    final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
    final boolean verbose = args.getFlag("-verbose");
    final boolean cloneDocs = args.getFlag("-cloneDocs");
    final Mode mode = Mode.valueOf(args.getString("-mode", "update").toUpperCase(Locale.ROOT));

    final long reopenEveryMS = (long) (1000 * reopenEverySec);

    if (verbose) {
      InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    if (!dirImpl.equals("RAMDirectory") && !dirImpl.equals("RAMExceptDirectPostingsDirectory")) {
      System.out.println("Wrap NRTCachingDirectory");
      dir0 = new NRTCachingDirectory(dir0, 20, 400.0);
    }

    dir = dir0;

    final IndexWriterConfig iwc = new IndexWriterConfig(a);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    iwc.setRAMBufferSizeMB(256.0);
    iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);

    // TODO: also RAMDirExceptDirect...? need to
    // ... block deletes against wrapped FSDir?
    if (dirImpl.equals("RAMDirectory")) {
      // Let IW remove files only referenced by starting commit:
      iwc.setIndexDeletionPolicy(new KeepNoCommitsDeletionPolicy());
    }

    if (commit != null && commit.length() > 0) {
      System.out.println("Opening writer on commit=" + commit);
      iwc.setIndexCommit(PerfUtils.findCommitPoint(commit, dir));
    }

    ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(useCFS ? 1.0 : 0.0);
    //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(1024);
    //((TieredMergePolicy) iwc.getMergePolicy()).setReclaimDeletesWeight(3.0);
    //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergeAtOnce(4);

    final Codec codec = new Lucene62Codec() {
      @Override
      public PostingsFormat getPostingsFormatForField(String field) {
        return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
      }
    };
    iwc.setCodec(codec);

    final ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
    // Only let one merge run at a time...
    // ... but queue up up to 4, before index thread is stalled:
    cms.setMaxMergesAndThreads(4, 1);

    iwc.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() {
      @Override
      public void warm(LeafReader reader) throws IOException {
        final long t0 = System.currentTimeMillis();
        //System.out.println("DO WARM: " + reader);
        IndexSearcher s = new IndexSearcher(reader);
        s.setQueryCache(null); // don't bench the cache
        s.search(new TermQuery(new Term(fieldName, "united")), 10);
        final long t1 = System.currentTimeMillis();
        System.out.println("warm segment=" + reader + " numDocs=" + reader.numDocs() + ": took " + (t1 - t0) + " msec");
      }
    });

    writer = new IndexWriter(dir, iwc);
    System.out.println("Initial writer.maxDoc()=" + writer.maxDoc());

    // TODO: add -nrtBodyPostingsOffsets instead of
    // hardwired false:
    boolean addDVFields = mode == Mode.BDV_UPDATE || mode == Mode.NDV_UPDATE;
    LineFileDocs lineFileDocs = new LineFileDocs(lineDocsFile, false, storeBody, tvsBody, false, cloneDocs,
        null, null, null, addDVFields);
    IndexThreads threads = new IndexThreads(new Random(17), writer, new AtomicBoolean(false), lineFileDocs,
        indexThreadCount, -1, false, false, mode, docsPerSecPerThread, null, -1.0, -1);
    threads.start();

    mgr = new SearcherManager(writer, new SearcherFactory() {
      @Override
      public IndexSearcher newSearcher(IndexReader reader, IndexReader previous) {
        IndexSearcher s = new IndexSearcher(reader);
        s.setQueryCache(null); // don't bench the cache
        s.setSimilarity(sim);
        return s;
      }
    });

    System.out.println("reopen every " + reopenEverySec);

    Thread reopenThread = new Thread() {
      @Override
      public void run() {
        try {
          final long startMS = System.currentTimeMillis();

          int reopenCount = 1;
          while (true) {
            final long sleepMS = startMS + (reopenCount * reopenEveryMS) - System.currentTimeMillis();
            if (sleepMS < 0) {
              System.out.println("WARNING: reopen fell behind by " + Math.abs(sleepMS) + " ms");
            } else {
              Thread.sleep(sleepMS);
            }

            mgr.maybeRefresh();
            reopenCount++;
            IndexSearcher s = mgr.acquire();
            try {
              if (ramDir != null) {
                System.out.println(String.format(Locale.ENGLISH,
                    "%.1fs: index: %d bytes in RAMDir; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                    (System.currentTimeMillis() - startMS) / 1000.0, ramDir.ramBytesUsed(), writer.maxDoc(),
                    s.getIndexReader().maxDoc(), s.getIndexReader().numDocs()));
                //String[] l = ramDir.listAll();
                //Arrays.sort(l);
                //for (String f : l) {
                //  System.out.println("  " + f + ": " + ramDir.fileLength(f));
                //}
              } else {
                System.out.println(String.format(Locale.ENGLISH,
                    "%.1fs: done reopen; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d",
                    (System.currentTimeMillis() - startMS) / 1000.0, writer.maxDoc(),
                    s.getIndexReader().maxDoc(), s.getIndexReader().numDocs()));
              }
            } finally {
              mgr.release(s);
            }
          }
        } catch (Exception e) {
          throw new RuntimeException(e);
        }
      }
    };
    reopenThread.setName("ReopenThread");
    reopenThread.setPriority(4 + Thread.currentThread().getPriority());
    reopenThread.start();

  } else {
    dir = dir0;
    writer = null;
    final DirectoryReader reader;
    if (commit != null && commit.length() > 0) {
      System.out.println("Opening searcher on commit=" + commit);
      reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, dir));
    } else {
      // open last commit
      reader = DirectoryReader.open(dir);
    }
    IndexSearcher s = new IndexSearcher(reader);
    s.setQueryCache(null); // don't bench the cache
    s.setSimilarity(sim);
    // NOTE: percentage of deleted docs; the original printed 100*maxDoc/numDocs, which is not a deletes percentage:
    System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs() + " %tg deletes="
        + (100. * (reader.maxDoc() - reader.numDocs()) / reader.maxDoc()));

    mgr = new SingleIndexSearcher(s);
  }

  System.out.println((System.currentTimeMillis() - tSearcherStart) + " msec to init searcher/NRT");

  {
    IndexSearcher s = mgr.acquire();
    try {
      System.out.println("Searcher: numDocs=" + s.getIndexReader().numDocs() + " maxDoc="
          + s.getIndexReader().maxDoc() + ": " + s);
    } finally {
      mgr.release(s);
    }
  }

  //System.out.println("searcher=" + searcher);

  FacetsConfig facetsConfig = new FacetsConfig();
  facetsConfig.setHierarchical("Date", true);

  TaxonomyReader taxoReader;
  Path taxoPath = Paths.get(args.getString("-indexPath"), "facets");
  Directory taxoDir = od.open(taxoPath);
  if (DirectoryReader.indexExists(taxoDir)) {
    taxoReader = new DirectoryTaxonomyReader(taxoDir);
    System.out.println("Taxonomy has " + taxoReader.getSize() + " ords");
  } else {
    taxoReader = null;
  }

  final Random staticRandom = new Random(staticRandomSeed);
  final Random random = new Random(randomSeed);

  final DirectSpellChecker spellChecker = new DirectSpellChecker();
  final IndexState indexState = new IndexState(mgr, taxoReader, fieldName, spellChecker, hiliteImpl, facetsConfig);

  final QueryParser queryParser = new QueryParser("body", a);
  TaskParser taskParser = new TaskParser(indexState, queryParser, fieldName, topN, staticRandom, doStoredLoads);

  final TaskSource tasks;

  if (tasksFile.startsWith("server:")) {
    int idx = tasksFile.indexOf(':', 8);
    if (idx == -1) {
      throw new RuntimeException("server is missing the port; should be server:interface:port (got: " + tasksFile + ")");
    }
    String iface = tasksFile.substring(7, idx);
    int port = Integer.valueOf(tasksFile.substring(1 + idx));
    RemoteTaskSource remoteTasks = new RemoteTaskSource(iface, port, searchThreadCount, taskParser);

    // nocommit must stop thread?
    tasks = remoteTasks;
  } else {
    // Load the tasks from a file:
    final int taskRepeatCount = args.getInt("-taskRepeatCount");
    final int numTaskPerCat = args.getInt("-tasksPerCat");
    tasks = new LocalTaskSource(indexState, taskParser, tasksFile, staticRandom, random, numTaskPerCat,
        taskRepeatCount, doPKLookup);
    System.out.println("Task repeat count " + taskRepeatCount);
    System.out.println("Tasks file " + tasksFile);
    System.out.println("Num task per cat " + numTaskPerCat);
  }

  args.check();

  // Evil respeller:
  //spellChecker.setMinPrefix(0);
  //spellChecker.setMaxInspections(1024);
  final TaskThreads taskThreads = new TaskThreads(tasks, indexState, searchThreadCount);
  Thread.sleep(10);

  final long startNanos = System.nanoTime();
  taskThreads.start();
  taskThreads.finish();
  final long endNanos = System.nanoTime();

  System.out.println("\n" + ((endNanos - startNanos) / 1000000.0) + " msec total");

  final List<Task> allTasks = tasks.getAllTasks();

  PrintStream out = new PrintStream(logFile);

  if (allTasks != null) {
    // Tasks were local: verify checksums:

    // indexState.setDocIDToID();

    final Map<Task, Task> tasksSeen = new HashMap<Task, Task>();

    out.println("\nResults for " + allTasks.size() + " tasks:");

    boolean fail = false;
    for (final Task task : allTasks) {
      if (verifyCheckSum) {
        final Task other = tasksSeen.get(task);
        if (other != null) {
          if (task.checksum() != other.checksum()) {
            System.out.println("\nTASK:");
            task.printResults(System.out, indexState);
            System.out.println("\nOTHER TASK:");
            other.printResults(System.out, indexState);
            fail = true;
            //throw new RuntimeException("task " + task + " hit different checksums: " + task.checksum() + " vs " + other.checksum() + " other=" + other);
          }
        } else {
          tasksSeen.put(task, task);
        }
      }
      out.println("\nTASK: " + task);
      out.println("  " + (task.runTimeNanos / 1000000.0) + " msec");
      out.println("  thread " + task.threadID);
      task.printResults(out, indexState);
    }
    if (fail) {
      throw new RuntimeException("some tasks got different results across different threads");
    }

    allTasks.clear();
  }

  mgr.close();

  if (taxoReader != null) {
    taxoReader.close();
  }

  if (writer != null) {
    // Don't actually commit any index changes:
    writer.rollback();
  }

  dir.close();

  if (printHeap) {
    // Try to get RAM usage -- some ideas poached from http://www.javaworld.com/javaworld/javatips/jw-javatip130.html
    final Runtime runtime = Runtime.getRuntime();
    long usedMem1 = PerfUtils.usedMemory(runtime);
    long usedMem2 = Long.MAX_VALUE;
    for (int iter = 0; iter < 10; iter++) {
      runtime.runFinalization();
      runtime.gc();
      Thread.yield();
      Thread.sleep(100);
      usedMem2 = usedMem1;
      usedMem1 = PerfUtils.usedMemory(runtime);
    }
    out.println("\nHEAP: " + PerfUtils.usedMemory(runtime));
  }
  out.close();
}
From source file:perf.TermsQueryPerf.java
License:Apache License
public static void main(String[] args) throws Exception {
  List<BytesRef> lookupIDs = new ArrayList<>();
  Random random = new Random(17);
  double rate = 1.01 * ((double) NUM_QUERIES * ID_SEARCH_COUNT) / ID_INDEX_COUNT;
  Path indexPath = Paths.get(args[0]);
  boolean doIndex = Files.exists(indexPath) == false;
  Directory dir = FSDirectory.open(indexPath);
  if (doIndex) {
    IndexWriterConfig iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // So I can walk the files and get the *.tip sizes:
    iwc.setUseCompoundFile(false);

    // 7/7/7 segment structure:
    iwc.setMaxBufferedDocs(ID_INDEX_COUNT / 777);
    iwc.setRAMBufferSizeMB(-1);
    ((TieredMergePolicy) iwc.getMergePolicy()).setFloorSegmentMB(.001);
    ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(0.0);

    IndexWriter w = new IndexWriter(dir, iwc);
    // IDIterator ids = zeroPadSequentialIDs(10);
    IDIterator ids = randomIDs(10, random);
    BytesRef idValue = new BytesRef(64);
    for (int i = 0; i < ID_INDEX_COUNT; i++) {
      ids.next(idValue);
      Document doc = new Document();
      doc.add(new StringField("id", idValue, Field.Store.NO));
      w.addDocument(doc);
      if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
        lookupIDs.add(BytesRef.deepCopyOf(idValue));
      }
      if (i % 100000 == 0) {
        System.out.println(i + " docs...");
      }
    }
    w.close();
  }

  IndexReader r = DirectoryReader.open(dir);

  if (doIndex == false) {
    System.out.println("Build lookup ids");
    TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator();
    BytesRef idValue;
    while ((idValue = termsEnum.next()) != null) {
      if (random.nextDouble() <= rate && lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
        lookupIDs.add(BytesRef.deepCopyOf(idValue));
        //System.out.println("add: " + idValue);
      }
    }
    shuffle(random, lookupIDs);
    System.out.println("Done build lookup ids");
  }

  IndexSearcher s = new IndexSearcher(r);

  if (lookupIDs.size() < NUM_QUERIES * ID_SEARCH_COUNT) {
    throw new RuntimeException(
        "didn't get enough lookup ids: " + (NUM_QUERIES * ID_SEARCH_COUNT) + " vs " + lookupIDs.size());
  }

  List<Query> queries = new ArrayList<Query>();
  for (int i = 0; i < NUM_QUERIES; i++) {
    List<BytesRef> sortedTermBytes = new ArrayList<>();
    for (BytesRef term : lookupIDs.subList(i * ID_SEARCH_COUNT, (i + 1) * ID_SEARCH_COUNT)) {
      sortedTermBytes.add(term);
    }
    Collections.sort(sortedTermBytes);
    // nocommit only do this if term count is high enough?
    // nocommit: we can be more efficient here, go straight to binary:
    Query query = new AutomatonQuery(new Term("id", "manyterms"), Automata.makeStringUnion(sortedTermBytes));
    //((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_REWRITE);
    //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT));
    queries.add(query);
  }

  // TODO: also include construction time of queries
  long best = Long.MAX_VALUE;
  for (int iter = 0; iter < 100; iter++) {
    long t0 = System.nanoTime();
    long totCount = 0;
    for (int i = 0; i < NUM_QUERIES; i++) {
      //Query query = new TermsQuery("id", lookupIDs.subList(i*ID_SEARCH_COUNT, (i+1)*ID_SEARCH_COUNT));
      Query query = queries.get(i);
      totCount += s.search(query, 10).totalHits;
    }
    if (totCount != NUM_QUERIES * ID_SEARCH_COUNT) {
      throw new RuntimeException("totCount=" + totCount + " but expected " + (NUM_QUERIES * ID_SEARCH_COUNT));
    }
    long t = System.nanoTime() - t0;
    System.out.println("ITER: " + iter + ": " + (t / 1000000.) + " msec");
    if (t < best) {
      System.out.println("  **");
      best = t;
    }
  }

  IOUtils.close(r, dir);
}
From source file:perf.TestBenchNRTPKLookup.java
License:Apache License
public static void main(String[] args) throws IOException {
  Directory dir = new MMapDirectory(new File(args[0]));
  //Directory dir = new NIOFSDirectory(new File(args[0]));
  IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
  iwc.setRAMBufferSizeMB(250);
  IndexWriter writer = new IndexWriter(dir, iwc);
  final SearcherManager manager = new SearcherManager(writer, true, new SearcherFactory() {
    @Override
    public IndexSearcher newSearcher(IndexReader r) {
      return new IndexSearcher(r);
    }
  });

  FieldType type = new FieldType();
  type.setIndexed(true);
  type.setTokenized(false);
  type.setStored(false);
  type.freeze();

  HashMap<Object, TermsEnum> cachedTermsEnum = new HashMap<Object, TermsEnum>();
  long time = System.currentTimeMillis();
  long lastTime = time;
  int num = 2500000;
  Random r = new Random(16);
  for (int i = 0; i < num; i++) {
    //Term t = new Term("_id", Integer.toString(i));
    String id = String.format("%010d", r.nextInt(Integer.MAX_VALUE));
    Term t = new Term("_id", id);
    IndexSearcher acquire = manager.acquire();
    try {
      IndexReader indexReader = acquire.getIndexReader();
      List<AtomicReaderContext> leaves = indexReader.leaves();
      for (AtomicReaderContext atomicReaderContext : leaves) {
        AtomicReader reader = atomicReaderContext.reader();
        TermsEnum termsEnum = cachedTermsEnum.get(reader.getCombinedCoreAndDeletesKey());
        if (termsEnum == null) {
          termsEnum = reader.fields().terms("_id").iterator(null);
          //cachedTermsEnum.put(reader.getCombinedCoreAndDeletesKey(), termsEnum); // uncomment this line to see improvements
        }
        // MKM
        //System.out.println("\nlookup seg=: " + reader + " term=" + t);
        if (termsEnum.seekExact(t.bytes())) {
          DocsEnum termDocsEnum = termsEnum.docs(reader.getLiveDocs(), null);
          if (termDocsEnum != null) {
            break;
          }
        }
      }
    } finally {
      manager.release(acquire);
    }
    Document d = new Document();
    d.add(new Field("_id", id, type));
    writer.updateDocument(t, d);
    //writer.addDocument(d);
    if (i % 50000 == 0) {
      long t1 = System.currentTimeMillis();
      System.out.println(i + " " + (t1 - lastTime) + " ms");
      lastTime = t1;
    }
    if ((i + 1) % 250000 == 0) {
      System.out.println("Reopen...");
      manager.maybeRefresh();
      IndexSearcher s = manager.acquire();
      try {
        System.out.println("  got: " + s);
      } finally {
        manager.release(s);
      }
    }
  }
  System.out.println("\nTotal: " + (System.currentTimeMillis() - time) + " msec");
  //System.out.println("loadBlockCount: " + BlockTreeTermsReader.loadBlockCount);
  manager.close();
  writer.close();
  dir.close();
}
From source file:proj.zoie.impl.indexing.internal.DiskSearchIndex.java
License:Apache License
/**
 * Opens an index writer.
 * @param analyzer Analyzer
 * @return IndexWriter instance
 */
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
  if (_indexWriter != null)
    return _indexWriter;

  Directory directory = _dirMgr.getDirectory(true);
  log.info("opening index writer at: " + _dirMgr.getPath());

  ZoieMergePolicy mergePolicy = new ZoieMergePolicy();
  mergePolicy.setMergePolicyParams(_mergePolicyParams);

  // hao: autocommit is set to false with this constructor
  IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
  config.setOpenMode(OpenMode.CREATE_OR_APPEND);
  config.setIndexDeletionPolicy(_deletionPolicy);
  config.setMergeScheduler(_mergeScheduler);
  config.setMergePolicy(mergePolicy);
  config.setReaderPooling(false);
  if (similarity != null) {
    config.setSimilarity(similarity);
  }
  config.setRAMBufferSizeMB(5);

  IndexWriter idxWriter = new IndexWriter(directory, config);
  _indexWriter = idxWriter;
  return idxWriter;
}
From source file:proj.zoie.impl.indexing.internal.RAMSearchIndex.java
License:Apache License
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
  if (_indexWriter != null)
    return _indexWriter;

  ZoieMergePolicy mergePolicy = new ZoieMergePolicy();
  mergePolicy.setMergePolicyParams(_mergePolicyParams);
  mergePolicy.setUseCompoundFile(false);

  IndexWriterConfig config = indexWriterConfigStorage.get();
  if (config == null) {
    config = new IndexWriterConfig(Version.LUCENE_34, analyzer);
    indexWriterConfigStorage.set(config);
  }
  config.setOpenMode(OpenMode.CREATE_OR_APPEND);
  config.setMergeScheduler(_mergeScheduler);
  config.setMergePolicy(mergePolicy);
  config.setReaderPooling(false);
  if (similarity != null) {
    config.setSimilarity(similarity);
  }
  config.setRAMBufferSizeMB(3);

  IndexWriter idxWriter = new IndexWriter(_directory, config);
  _indexWriter = idxWriter;
  return idxWriter;
}
From source file:semanticRelatedness.MakeLuceneIndex.java
License:Apache License
/**
 * Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
  String baseDir = "/home/chrisschaefer/";
  //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
  String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
  String luceneIndexName = "enwiki-20130604-lucene2";
  System.currentTimeMillis();
  boolean bIgnoreStubs = false;

  for (int i = 0; i < args.length; ++i) {
    if (args[i].equals("-luceneindex"))
      luceneIndexName = args[++i];
    if (args[i].equals("-basedir"))
      baseDir = args[++i];
    if (args[i].equals("-dumpfile"))
      wikiDumpFile = args[++i];
    if (args[i].equals("-includestubs"))
      bIgnoreStubs = true;
  }
  String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
  String logPath = baseDir + luceneIndexName + ".log";
  PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
  PrintWriter logger = new PrintWriter(logPath, "UTF-8");
  logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
  System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

  Date start = new Date();

  try {
    Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

    // Analyzer analyzer = new WikipediaAnalyzer();
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

    // Create a new index in the directory, removing any
    // previously indexed documents:
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setSimilarity(new DefaultSimilarity());
    //iwc.setSimilarity(new ESASimilarity());

    // Optional: for better indexing performance, if you
    // are indexing many documents, increase the RAM
    // buffer. But if you do this, increase the max heap
    // size to the JVM (eg add -Xmx512m or -Xmx1g):
    //
    // iwc.setRAMBufferSizeMB(2000.0);

    IndexWriter writer = new IndexWriter(dir, iwc);
    Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
    wikidumpExtractor.setLinkSeparator("_");
    wikidumpExtractor.setCategorySeparator("_");
    wikidumpExtractor.setTitleSeparator(" ");

    int iStubs = 0;
    int iArticleCount = 0;
    int iSkippedPageCount = 0;
    long iStartTime = java.lang.System.nanoTime();
    long iTime = iStartTime;

    while (wikidumpExtractor.nextPage()) {
      if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
        ++iSkippedPageCount;
        continue;
      }
      if (bIgnoreStubs && wikidumpExtractor.getStub()) {
        ++iStubs;
        continue;
      }
      // skip pages with less than 5 out links
      if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
        ++iSkippedPageCount;
        continue;
      }
      if (wikidumpExtractor.getPageCategories().equals("")) {
        ++iSkippedPageCount;
        logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
        continue;
      } else {
        for (String link : wikidumpExtractor.getPageLinkList(false)) {
          // artikelTextWriter.println(link);
          if (_inLinks.containsKey(link)) {
            int tmp = _inLinks.get(link);
            tmp++;
            _inLinks.put(link, tmp);
          } else {
            _inLinks.put(link, 1);
          }
        }
      }
      if (wikidumpExtractor.getPageText().equals("")) {
        ++iSkippedPageCount;
        continue;
      }
      artikelTextWriter.println(wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

      ++iArticleCount;

      if (iArticleCount % 1000 == 0) {
        logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
            + " iSkippedPageCount: " + iSkippedPageCount);
      }
    }
    artikelTextWriter.close();
    iArticleCount = 0;

    PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
    BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
    String line = br.readLine();

    while (line != null) {
      int endOfTitle = line.indexOf("\t");
      String title = line.substring(0, endOfTitle);
      if (_inLinks.containsKey(title)) {
        int inlinks = _inLinks.get(title);
        artikelInLinkWriter.println(title + "\t" + inlinks);
        if (inlinks > 4) {
          //System.out.println("inlinks > 0 ");
          Document doc = new Document();
          ++iArticleCount;

          // wikidumpExtractor.setTitleSeparator( "_" );
          // doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
          // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
          //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
          doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
              + line.substring(endOfTitle + 1), Field.Store.NO));
          // System.out.println(title + " " +
          //     title + " " +
          //     title + " " +
          //     title + " " +
          //     line.substring(endOfTitle + 1));

          writer.addDocument(doc);

          if (iArticleCount % 1000 == 0) {
            writer.commit();
            logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                + " iSkippedPageCount: " + iSkippedPageCount);
          }
        }
      } else {
        artikelInLinkWriter.println(title + "\t0");
      }
      line = br.readLine();
    }
    br.close();
    artikelInLinkWriter.close();

    // NOTE: if you want to maximize search performance,
    // you can optionally call forceMerge here.  This can be
    // a terribly costly operation, so generally it's only
    // worth it when your index is relatively static (ie
    // you're done adding documents to it):
    //
    // writer.commit();
    writer.forceMerge(1);
    writer.close();

    Date end = new Date();
    String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
        + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
    logger.println(endStatement);
    System.out.println(endStatement);
    logger.close();
  } catch (Exception e) {
    System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
  }
}