List of usage examples for org.apache.lucene.util PrintStreamInfoStream PrintStreamInfoStream
public PrintStreamInfoStream(PrintStream stream)
From source file: IndexAndSearchOpenStreetMaps1D.java
License: Apache License
private static void createIndex() throws IOException { long t0 = System.nanoTime(); CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT) .onUnmappableCharacter(CodingErrorAction.REPORT); int BUFFER_SIZE = 1 << 16; // 64K InputStream is = Files .newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt")); BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE); Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : ""))); IndexWriterConfig iwc = new IndexWriterConfig(null); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //iwc.setMaxBufferedDocs(109630); //iwc.setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH); iwc.setRAMBufferSizeMB(256.0);/*from w ww.jav a2 s . co m*/ iwc.setMergePolicy(new LogDocMergePolicy()); iwc.setMergeScheduler(new SerialMergeScheduler()); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); IndexWriter w = new IndexWriter(dir, iwc); int count = 0; byte[] scratch = new byte[4]; while (true) { String line = reader.readLine(); if (line == null) { break; } String[] parts = line.split(","); //long id = Long.parseLong(parts[0]); int lat = (int) (1000000. * Double.parseDouble(parts[1])); //int lon = (int) (1000000. * Double.parseDouble(parts[2])); Document doc = new Document(); if (USE_NF) { doc.add(new LegacyIntField("latnum", lat, Field.Store.NO)); //doc.add(new LongField("lonnum", lon, Field.Store.NO)); } else { doc.add(new IntPoint("lat", lat)); //doc.add(new SortedNumericDocValuesField("lon", lon)); } w.addDocument(doc); count++; if (count % 1000000 == 0) { System.out.println(count + "..."); } } //w.forceMerge(1); w.commit(); System.out.println(w.maxDoc() + " total docs"); w.close(); long t1 = System.nanoTime(); System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index"); }
From source file: IndexTaxis.java
License: Apache License
/**
 * Indexes a taxi-rides CSV file with multiple threads.
 *
 * <p>Usage: {@code indexPath threadCount docsPath}. The CSV header row is parsed
 * here; the body is consumed in chunks by {@code readChunk}/{@code indexOneChunk}
 * (defined elsewhere in this class).
 *
 * <p>Fixes vs. the previous revision: removed the unused {@code threadID} local and
 * the zero-length-array {@code toArray} allocation anti-idiom.
 *
 * @param args {@code [0]} index directory, {@code [1]} worker thread count,
 *             {@code [2]} path to the CSV docs file
 * @throws Exception on any I/O or indexing failure
 */
public static void main(String[] args) throws Exception {
    Path indexPath = Paths.get(args[0]);
    Directory dir = FSDirectory.open(indexPath);
    int threadCount = Integer.parseInt(args[1]);
    Path docsPath = Paths.get(args[2]);
    IndexWriterConfig iwc = new IndexWriterConfig();
    iwc.setRAMBufferSizeMB(1024.);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    final IndexWriter w = new IndexWriter(dir, iwc);
    BufferedInputStream docs = new BufferedInputStream(Files.newInputStream(docsPath, StandardOpenOption.READ));

    // Parse the header fields byte-by-byte up to the first newline.
    // NOTE(review): assumes '\n' line endings; a \r\n header would leave a trailing
    // '\r' in the last field name — confirm against the data files used.
    List<String> fieldsList = new ArrayList<>();
    StringBuilder builder = new StringBuilder();
    while (true) {
        int x = docs.read();
        if (x == -1) {
            throw new IllegalArgumentException(
                    "hit EOF while trying to read CSV header; are you sure you have the right CSV file!");
        }
        byte b = (byte) x;
        if (b == NEWLINE) {
            fieldsList.add(builder.toString());
            break;
        } else if (b == COMMA) {
            fieldsList.add(builder.toString());
            builder.setLength(0);
        } else {
            // this is OK because headers are all ascii:
            builder.append((char) b);
        }
    }
    final String[] fields = fieldsList.toArray(new String[0]);

    Thread[] threads = new Thread[threadCount];
    final AtomicInteger docCounter = new AtomicInteger();
    final AtomicLong bytesCounter = new AtomicLong();
    startNS = System.nanoTime();
    for (int i = 0; i < threadCount; i++) {
        threads[i] = new Thread() {
            @Override
            public void run() {
                try {
                    _run();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }

            // Pulls chunks from the shared stream until it is exhausted.
            private void _run() throws IOException {
                while (true) {
                    Chunk chunk = readChunk(docs);
                    if (chunk == null) {
                        break;
                    }
                    indexOneChunk(fields, chunk, w, docCounter, bytesCounter);
                }
            }
        };
        threads[i].start();
    }
    for (Thread thread : threads) {
        thread.join();
    }
    System.out.println("Indexing done; now close");
    w.close();
    docs.close();
}
From source file: luceneingester.TrecIngester.java
License: Apache License
/**
 * Command-line driver that indexes a TREC corpus into a Lucene index.
 *
 * <p>Flags (parsed by the project's {@code Args} helper): {@code -indexPath},
 * {@code -dataDir}, {@code -docCountLimit} (-1 = all docs), {@code -threadCount},
 * {@code -verbose}, {@code -printDPS}, {@code -update} (append to an existing index),
 * {@code -positions}. Prints throughput statistics and fails if the final doc count
 * does not match {@code -docCountLimit}.
 *
 * @throws Exception on any argument, I/O, or indexing failure
 */
public static void main(String[] clArgs) throws Exception {
    Args args = new Args(clArgs);
    final String dirPath = args.getString("-indexPath") + "/index";
    final String dataDir = args.getString("-dataDir");
    final int docCountLimit = args.getInt("-docCountLimit"); // -1 means all docs from the source:
    final int numThreads = args.getInt("-threadCount");
    final boolean verbose = args.getFlag("-verbose");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean doUpdate = args.getFlag("-update");
    final boolean positions = args.getFlag("-positions");
    args.check();
    final Analyzer a = new EnglishAnalyzer();
    final TrecContentSource trecSource = createTrecSource(dataDir);
    final Directory dir = FSDirectory.open(Paths.get(dirPath));
    System.out.println("Index path: " + dirPath);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("Positions: " + (positions ? "yes" : "no"));
    if (verbose) {
        // Route IndexWriter's internal diagnostics to stdout:
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }
    final IndexWriterConfig iwc = new IndexWriterConfig(a);
    if (doUpdate) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }
    System.out.println("IW config=" + iwc);
    final IndexWriter w = new IndexWriter(dir, iwc);
    // IndexThreads (project helper) owns the worker pool; poll until it reports done.
    IndexThreads threads = new IndexThreads(w, positions, trecSource, numThreads, docCountLimit, printDPS);
    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();
    threads.start();
    while (!threads.done()) {
        Thread.sleep(100);
    }
    threads.stop();
    final long t1 = System.currentTimeMillis();
    System.out.println(
            "\nIndexer: indexing done (" + (t1 - t0) / 1000.0 + " sec); total " + w.maxDoc() + " docs");
    // Doc-count check only makes sense for a fresh, limited build:
    if (!doUpdate && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }
    final long t2;
    t2 = System.currentTimeMillis();
    // Tag the commit so later tooling can distinguish this multi-segment commit:
    final Map<String, String> commitData = new HashMap<String, String>();
    commitData.put("userData", "multi");
    w.setCommitData(commitData);
    w.commit();
    final long t3 = System.currentTimeMillis();
    System.out.println("\nIndexer: commit multi (took " + (t3 - t2) / 1000.0 + " sec)");
    System.out.println("\nIndexer: at close: " + w.segString());
    final long tCloseStart = System.currentTimeMillis();
    w.close();
    System.out.println(
            "\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) / 1000.0 + " sec");
    dir.close();
    final long tFinal = System.currentTimeMillis();
    System.out.println("\nIndexer: finished (" + (tFinal - t0) / 1000.0 + " sec)");
    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());
    System.out.println(
            "\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / ((tFinal - t0) / 3600000.))
                    + " GB/hour plain text");
}
From source file: org.apache.solr.update.SolrIndexConfig.java
License: Apache License
/**
 * Constructs a SolrIndexConfig which parses the Lucene related config params in solrconfig.xml
 * @param solrConfig the overall SolrConfig object
 * @param prefix the XPath prefix for which section to parse (mandatory)
 * @param def a SolrIndexConfig instance to pick default values from (optional)
 */
@SuppressWarnings("deprecation")
public SolrIndexConfig(SolrConfig solrConfig, String prefix, SolrIndexConfig def) {
    if (prefix == null) {
        prefix = "indexConfig";
        log.debug("Defaulting to prefix \"" + prefix + "\" for index configuration");
    }
    if (def == null) {
        def = new SolrIndexConfig(solrConfig);
    }

    // sanity check: this will throw an error for us if there is more then one
    // config section
    Object unused = solrConfig.getNode(prefix, false);

    luceneVersion = solrConfig.luceneMatchVersion;

    // Assert that end-of-life parameters or syntax is not in our config.
    // Warn for luceneMatchVersion's before LUCENE_36, fail fast above
    assertWarnOrFail(
            "The <mergeScheduler>myclass</mergeScheduler> syntax is no longer supported in solrconfig.xml. Please use syntax <mergeScheduler class=\"myclass\"/> instead.",
            !((solrConfig.getNode(prefix + "/mergeScheduler", false) != null)
                    && (solrConfig.get(prefix + "/mergeScheduler/@class", null) == null)),
            true);
    assertWarnOrFail(
            "The <mergePolicy>myclass</mergePolicy> syntax is no longer supported in solrconfig.xml. Please use syntax <mergePolicy class=\"myclass\"/> instead.",
            !((solrConfig.getNode(prefix + "/mergePolicy", false) != null)
                    && (solrConfig.get(prefix + "/mergePolicy/@class", null) == null)),
            true);
    assertWarnOrFail(
            "The <luceneAutoCommit>true|false</luceneAutoCommit> parameter is no longer valid in solrconfig.xml.",
            solrConfig.get(prefix + "/luceneAutoCommit", null) == null, true);

    // Copy scalar settings from the config, falling back to the supplied defaults:
    defaultMergePolicyClassName = def.defaultMergePolicyClassName;
    useCompoundFile = solrConfig.getBool(prefix + "/useCompoundFile", def.useCompoundFile);
    // (sic: pre-existing typo in the field name, preserved — renaming would break other callers)
    effectiveUseCompountFileSetting = useCompoundFile;
    maxBufferedDocs = solrConfig.getInt(prefix + "/maxBufferedDocs", def.maxBufferedDocs);
    maxMergeDocs = solrConfig.getInt(prefix + "/maxMergeDocs", def.maxMergeDocs);
    maxIndexingThreads = solrConfig.getInt(prefix + "/maxIndexingThreads", def.maxIndexingThreads);
    mergeFactor = solrConfig.getInt(prefix + "/mergeFactor", def.mergeFactor);
    ramBufferSizeMB = solrConfig.getDouble(prefix + "/ramBufferSizeMB", def.ramBufferSizeMB);
    writeLockTimeout = solrConfig.getInt(prefix + "/writeLockTimeout", def.writeLockTimeout);
    lockType = solrConfig.get(prefix + "/lockType", def.lockType);

    mergeSchedulerInfo = getPluginInfo(prefix + "/mergeScheduler", solrConfig, def.mergeSchedulerInfo);
    mergePolicyInfo = getPluginInfo(prefix + "/mergePolicy", solrConfig, def.mergePolicyInfo);
    termIndexInterval = solrConfig.getInt(prefix + "/termIndexInterval", def.termIndexInterval);

    // <infoStream>: either log via Solr's logger, or (deprecated) append to a file.
    boolean infoStreamEnabled = solrConfig.getBool(prefix + "/infoStream", false);
    if (infoStreamEnabled) {
        String infoStreamFile = solrConfig.get(prefix + "/infoStream/@file", null);
        if (infoStreamFile == null) {
            log.info("IndexWriter infoStream solr logging is enabled");
            infoStream = new LoggingInfoStream();
        } else {
            log.warn("IndexWriter infoStream file log is enabled: " + infoStreamFile
                    + "\nThis feature is deprecated. Remove @file from <infoStream> to output messages to solr's logfile");
            File f = new File(infoStreamFile);
            File parent = f.getParentFile();
            // NOTE(review): mkdirs() return value is ignored; a failure surfaces later
            // as the FileOutputStream exception logged below.
            if (parent != null)
                parent.mkdirs();
            try {
                // Append mode; the PrintStream takes ownership of the stream for the
                // lifetime of the infoStream.
                FileOutputStream fos = new FileOutputStream(f, true);
                infoStream = new PrintStreamInfoStream(new PrintStream(fos, true, "UTF-8"));
            } catch (Exception e) {
                log.error("Could not create info stream for file " + infoStreamFile, e);
            }
        }
    }

    mergedSegmentWarmerInfo = getPluginInfo(prefix + "/mergedSegmentWarmer", solrConfig,
            def.mergedSegmentWarmerInfo);
    if (mergedSegmentWarmerInfo != null && solrConfig.nrtMode == false) {
        throw new IllegalArgumentException(
                "Supplying a mergedSegmentWarmer will do nothing since nrtMode is false");
    }
}
From source file: perf.AutoPrefixPerf.java
License: Apache License
/**
 * Benchmarks numeric-range search: either NumericField trie terms ({@code precStep != 0})
 * or binary terms with auto-prefix ({@code precStep == 0}, min/max terms-in-prefix given).
 *
 * <p>Args: {@code numbersFile queriesFile indexPath precStep [minTermsInPrefix maxTermsInPrefix]}.
 * Builds the index (one long per input line), runs CheckIndex, then times up to 200
 * range queries over 10 iterations, reporting the best pass.
 *
 * @throws Exception on any I/O, parse, or index failure
 */
public static void main(String[] args) throws Exception {
    String numbersFile = args[0];
    String queriesFile = args[1];
    Path indexPath = Paths.get(args[2]);
    int precStep = Integer.parseInt(args[3]);
    boolean useNumericField = (precStep != 0);
    int maxTermsInPrefix;
    int minTermsInPrefix;
    if (useNumericField == false) {
        minTermsInPrefix = Integer.parseInt(args[4]);
        maxTermsInPrefix = Integer.parseInt(args[5]);
    } else {
        minTermsInPrefix = 0;
        maxTermsInPrefix = 0;
    }

    // Fixed 8-byte scratch token reused for every binary-encoded long:
    BytesRefBuilder binaryToken = new BytesRefBuilder();
    binaryToken.grow(8);
    binaryToken.setLength(8);

    Directory dir = FSDirectory.open(indexPath);
    // NOTE(review): this indexes when the path ALREADY exists and skips when it does
    // not — the opposite of the "Skip indexing: index already exists" message below.
    // It presumably relies on FSDirectory.open having just created the directory on a
    // fresh run; confirm against the FSDirectory version in use.
    if (Files.notExists(indexPath) == false) {
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        iwc.setMaxBufferedDocs(30000);
        iwc.setRAMBufferSizeMB(-1);
        iwc.setMergePolicy(new LogDocMergePolicy());
        final PostingsFormat pf;
        if (useNumericField) {
            // Disable auto-prefix when testing NumericField!
            if (minTermsInPrefix != 0) {
                throw new IllegalArgumentException("only precStep or minTermsInPrefix should be non-zero");
            }
            pf = new Lucene50PostingsFormat(25, 48, 0, 0);
        } else {
            /*
            if (minTermsInPrefix == 0) {
              throw new IllegalArgumentException("one of precStep or minTermsInPrefix must be non-zero");
            }
            */
            pf = new Lucene50PostingsFormat(25, 48, minTermsInPrefix, maxTermsInPrefix);
            //pf = new Lucene50PostingsFormat(25, 48, minTermsInPrefix, Integer.MAX_VALUE);
        }
        // Force the custom postings format for every field:
        iwc.setCodec(new Lucene53Codec() {
            @Override
            public PostingsFormat getPostingsFormatForField(String field) {
                return pf;
            }
        });
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        iwc.setMergeScheduler(new SerialMergeScheduler());
        //TieredMergePolicy tmp = (TieredMergePolicy) iwc.getMergePolicy();
        //tmp.setFloorSegmentMB(.1);
        //ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler();
        // More concurrency (for SSD)
        //cms.setMaxMergesAndThreads(5, 3);
        final IndexWriter w = new IndexWriter(dir, iwc);

        // One reusable Document/Field pair; only the value changes per input line.
        Document doc = new Document();
        Field field;
        if (useNumericField) {
            FieldType longFieldType = new FieldType(LongField.TYPE_NOT_STORED);
            longFieldType.setNumericPrecisionStep(precStep);
            longFieldType.freeze();
            field = new LongField("number", 0L, longFieldType);
            doc.add(field);
        } else {
            FieldType longFieldType = new FieldType(TextField.TYPE_NOT_STORED);
            longFieldType.setIndexOptions(IndexOptions.DOCS_ONLY);
            longFieldType.setOmitNorms(true);
            longFieldType.setIndexRanges(true);
            longFieldType.freeze();
            // The token stream re-reads binaryToken, so longToBytes below mutates it in place:
            field = new Field("number", new BinaryTokenStream(binaryToken.get()), longFieldType);
            doc.add(field);
        }

        long startMS = System.currentTimeMillis();
        // 64K buffer:
        InputStream is = new FileInputStream(numbersFile);
        BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
        int count = 0;
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                break;
            }
            Long v = Long.parseLong(line.trim());
            if (useNumericField) {
                field.setLongValue(v);
            } else {
                //NumericUtils.longToPrefixCoded(v, 0, binaryToken);
                longToBytes(v, binaryToken);
                //if (bytesToLong(binaryToken.get()) != v) {
                //  throw new RuntimeException("wrong long: v=" + v + " vs " + bytesToLong(binaryToken.get()));
                //}
            }
            w.addDocument(doc);
            count++;
            if (count % 200000 == 0) {
                long ms = System.currentTimeMillis();
                System.out.println("Indexed " + count + ": " + ((ms - startMS) / 1000.0) + " sec");
            }
        }
        reader.close();
        System.out.println("Final Indexed " + count + ": "
                + ((System.currentTimeMillis() - startMS) / 1000.0) + " sec");
        // nocommit just to make debugging easier:
        //System.out.println("Optimize...");
        //w.forceMerge(1);
        System.out.println("Close...");
        w.close();
        System.out.println("After close: " + ((System.currentTimeMillis() - startMS) / 1000.0) + " sec");

        // Print CheckIndex:
        ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
        CheckIndex checker = new CheckIndex(dir);
        checker.setInfoStream(new PrintStream(bos, false, IOUtils.UTF_8), true);
        CheckIndex.Status status = checker.checkIndex();
        System.out.println("Done CheckIndex:");
        System.out.println(bos.toString(IOUtils.UTF_8));
        if (status.clean == false) {
            throw new IllegalStateException("CheckIndex failed");
        }

        SegmentInfos infos = new SegmentInfos();
        infos.read(dir);
        long totBytes = 0;
        for (SegmentCommitInfo info : infos) {
            totBytes += info.sizeInBytes();
        }
        System.out.println("\nTotal index size: " + totBytes + " bytes");
    } else {
        System.out.println("Skip indexing: index already exists");
    }

    // Load up to 200 "min max" query lines.
    // NOTE(review): this reader is never closed — harmless for a one-shot benchmark
    // but a leak all the same.
    List<Query> queries = new ArrayList<>();
    InputStream is = new FileInputStream(queriesFile);
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16);
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }
        String[] numbers = line.trim().split(" ");
        if (numbers.length != 2) {
            throw new IllegalArgumentException("could not parse query line: " + line);
        }
        long minValue = Long.parseLong(numbers[0]);
        long maxValue = Long.parseLong(numbers[1]);
        if (useNumericField) {
            queries.add(NumericRangeQuery.newLongRange("number", precStep, minValue, maxValue, true, true));
        } else {
            longToBytes(minValue, binaryToken);
            BytesRef minTerm = binaryToken.toBytesRef();
            longToBytes(maxValue, binaryToken);
            BytesRef maxTerm = binaryToken.toBytesRef();
            queries.add(new TermRangeQuery("number", minTerm, maxTerm, true, true));
        }
        if (queries.size() == 200) {
            break;
        }
    }

    DirectoryReader r = DirectoryReader.open(dir);
    IndexSearcher s = new IndexSearcher(r);
    s.setQueryCache(null); // don't bench the cache
    printQueryTerms((MultiTermQuery) queries.get(0), s);

    // 10 timed passes over all queries; "hash" guards against dead-code elimination
    // and checks run-to-run result stability.
    long bestMS = Long.MAX_VALUE;
    for (int iter = 0; iter < 10; iter++) {
        long startMS = System.currentTimeMillis();
        long totalHits = 0;
        long hash = 0;
        for (Query query : queries) {
            TopDocs hits = s.search(query, 10);
            totalHits += hits.totalHits;
            hash = hash * 31 + hits.totalHits;
        }
        long ms = System.currentTimeMillis() - startMS;
        System.out.println("iter " + iter + ": " + ms + " msec; totalHits=" + totalHits + " hash=" + hash);
        if (ms < bestMS) {
            System.out.println(" **");
            bestMS = ms;
        }
    }
    r.close();
    dir.close();
}
From source file: perf.IndexAndSearchOpenStreetMaps.java
License: Apache License
/**
 * Builds the OpenStreetMaps lat/lon benchmark index, one index per part.
 *
 * <p>A shared BufferedReader is consumed by NUM_THREADS workers; each worker grabs
 * CHUNK lines under {@code lock}, then parses and indexes them outside the lock.
 * The point encoding is chosen by the class-level flags {@code useGeoPoint} /
 * {@code useGeo3D} / {@code useGeo3DLarge} (default: {@code LatLonPoint}).
 *
 * @param fast use 4 threads + big RAM buffer / unbounded merged segments; otherwise
 *             a deterministic single-threaded, doc-count-merge configuration
 * @param doForceMerge force-merge each part to one segment after commit
 * @param doDistanceSort also add LatLonDocValuesField so distance sorting works
 * @throws IOException on read/write failure
 * @throws InterruptedException if interrupted while joining worker threads
 */
private static void createIndex(boolean fast, boolean doForceMerge, boolean doDistanceSort)
        throws IOException, InterruptedException {
    // Fail loudly on malformed input bytes rather than silently replacing them:
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is;
    if (SMALL) {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.subsetPlusAllLondon.txt"));
    } else {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.txt"));
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
    int NUM_THREADS;
    if (fast) {
        NUM_THREADS = 4;
    } else {
        NUM_THREADS = 1;
    }
    int CHUNK = 10000; // lines claimed per lock acquisition
    long t0 = System.nanoTime();
    AtomicLong totalCount = new AtomicLong();
    for (int part = 0; part < NUM_PARTS; part++) {
        Directory dir = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        IndexWriterConfig iwc = new IndexWriterConfig(null);
        iwc.setCodec(getCodec(fast));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        if (fast) {
            ((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(Double.POSITIVE_INFINITY);
            iwc.setRAMBufferSizeMB(1024);
        } else {
            iwc.setMaxBufferedDocs(109630);
            iwc.setMergePolicy(new LogDocMergePolicy());
            iwc.setMergeScheduler(new SerialMergeScheduler());
        }
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        IndexWriter w = new IndexWriter(dir, iwc);

        Thread[] threads = new Thread[NUM_THREADS];
        AtomicBoolean finished = new AtomicBoolean();
        Object lock = new Object();
        final int finalPart = part;
        for (int t = 0; t < NUM_THREADS; t++) {
            threads[t] = new Thread() {
                @Override
                public void run() {
                    String[] lines = new String[CHUNK];
                    int chunkCount = 0;
                    while (finished.get() == false) {
                        try {
                            int count = CHUNK;
                            // Claim up to CHUNK lines atomically; parsing/indexing happens
                            // outside the lock so readers don't serialize the CPU work.
                            synchronized (lock) {
                                for (int i = 0; i < CHUNK; i++) {
                                    String line = reader.readLine();
                                    if (line == null) {
                                        count = i;
                                        finished.set(true);
                                        break;
                                    }
                                    lines[i] = line;
                                }
                                // Part 0 is capped at 2B docs (Lucene's per-index doc limit):
                                if (finalPart == 0 && totalCount.get() + count >= 2000000000) {
                                    finished.set(true);
                                }
                            }
                            for (int i = 0; i < count; i++) {
                                String[] parts = lines[i].split(",");
                                //long id = Long.parseLong(parts[0]);
                                double lat = Double.parseDouble(parts[1]);
                                double lon = Double.parseDouble(parts[2]);
                                Document doc = new Document();
                                if (useGeoPoint) {
                                    doc.add(new GeoPointField("point", lat, lon, Field.Store.NO));
                                } else if (useGeo3D || useGeo3DLarge) {
                                    doc.add(new Geo3DPoint("point", lat, lon));
                                } else {
                                    doc.add(new LatLonPoint("point", lat, lon));
                                    if (doDistanceSort) {
                                        doc.add(new LatLonDocValuesField("point", lat, lon));
                                    }
                                }
                                w.addDocument(doc);
                                long x = totalCount.incrementAndGet();
                                if (x % 1000000 == 0) {
                                    System.out.println(x + "...");
                                }
                            }
                            chunkCount++;
                            // Disabled early-exit hook for debugging runs:
                            if (false && SMALL == false && chunkCount == 20000) {
                                System.out.println("NOW BREAK EARLY");
                                break;
                            }
                        } catch (IOException ioe) {
                            throw new RuntimeException(ioe);
                        }
                    }
                }
            };
            threads[t].start();
        }
        for (Thread thread : threads) {
            thread.join();
        }
        System.out.println("Part " + part + " is done: w.maxDoc()=" + w.maxDoc());
        w.commit();
        System.out.println("done commit");
        long t1 = System.nanoTime();
        System.out.println(((t1 - t0) / 1000000000.0) + " sec to index part " + part);
        if (doForceMerge) {
            w.forceMerge(1);
            long t2 = System.nanoTime();
            System.out.println(((t2 - t1) / 1000000000.0) + " sec to force merge part " + part);
        }
        w.close();
    }
    // NOTE(review): `reader` is never closed after the last part — pre-existing leak.
}
From source file: perf.Indexer.java
License: Apache License
/**
 * Main body of the luceneutil benchmark indexer: parses a large flag set, configures
 * an IndexWriter (analyzer, codec, merge policy/scheduler, optional index sort and
 * facets), runs the project's {@code IndexThreads} pool over a line-doc file, then
 * optionally waits for merges, commits, force-merges, applies random deletions, and
 * reports throughput.
 *
 * @param clArgs raw command-line flags, parsed by the project's {@code Args} helper
 * @throws Exception on any argument, I/O, or indexing failure
 */
private static void _main(String[] clArgs) throws Exception {
    Args args = new Args(clArgs);

    // EG: -facets Date -facets characterCount ...
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);
    final Set<String> facetFields = new HashSet<String>();
    if (args.hasArg("-facets")) {
        for (String arg : args.getStrings("-facets")) {
            facetFields.add(arg);
        }
    }

    final String dirImpl = args.getString("-dirImpl");
    final String dirPath = args.getString("-indexPath") + "/index";
    final Directory dir;
    OpenDirectory od = OpenDirectory.get(dirImpl);
    dir = od.open(Paths.get(dirPath));

    // Select the analyzer by name; unknown names fail fast.
    final String analyzer = args.getString("-analyzer");
    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2, 2);
    } else if (analyzer.equals("ShingleStandardAnalyzerNoStopWords")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final String lineFile = args.getString("-lineDocsFile");
    // -1 means all docs in the line file:
    final int docCountLimit = args.getInt("-docCountLimit");
    final int numThreads = args.getInt("-threadCount");
    final boolean doForceMerge = args.getFlag("-forceMerge");
    final boolean verbose = args.getFlag("-verbose");

    // Optional "-indexSort field:type" (type is long or string):
    String indexSortField = null;
    SortField.Type indexSortType = null;
    if (args.hasArg("-indexSort")) {
        indexSortField = args.getString("-indexSort");
        int i = indexSortField.indexOf(':');
        if (i == -1) {
            throw new IllegalArgumentException(
                    "-indexSort should have form field:type; got: " + indexSortField);
        }
        String typeString = indexSortField.substring(i + 1, indexSortField.length());
        if (typeString.equals("long")) {
            indexSortType = SortField.Type.LONG;
        } else if (typeString.equals("string")) {
            indexSortType = SortField.Type.STRING;
        } else {
            throw new IllegalArgumentException("-indexSort can only handle 'long' sort; got: " + typeString);
        }
        indexSortField = indexSortField.substring(0, i);
    } else {
        indexSortType = null;
    }

    final double ramBufferSizeMB = args.getDouble("-ramBufferMB");
    final int maxBufferedDocs = args.getInt("-maxBufferedDocs");
    final String defaultPostingsFormat = args.getString("-postingsFormat");
    final boolean doDeletions = args.getFlag("-deletions");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean waitForMerges = args.getFlag("-waitForMerges");
    final boolean waitForCommit = args.getFlag("-waitForCommit");
    final String mergePolicy = args.getString("-mergePolicy");
    final Mode mode;
    final boolean doUpdate = args.getFlag("-update");
    if (doUpdate) {
        mode = Mode.UPDATE;
    } else {
        mode = Mode.valueOf(args.getString("-mode", "add").toUpperCase(Locale.ROOT));
    }
    int randomDocIDMax;
    if (mode == Mode.UPDATE) {
        randomDocIDMax = args.getInt("-randomDocIDMax");
    } else {
        randomDocIDMax = -1;
    }
    final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
    final boolean addGroupingFields = args.getFlag("-grouping");
    final boolean useCFS = args.getFlag("-cfs");
    final boolean storeBody = args.getFlag("-store");
    final boolean tvsBody = args.getFlag("-tvs");
    final boolean bodyPostingsOffsets = args.getFlag("-bodyPostingsOffsets");
    final int maxConcurrentMerges = args.getInt("-maxConcurrentMerges");
    final boolean addDVFields = args.getFlag("-dvfields");
    final boolean doRandomCommit = args.getFlag("-randomCommit");
    final boolean useCMS = args.getFlag("-useCMS");
    final boolean disableIOThrottle = args.getFlag("-disableIOThrottle");

    // Flag-combination sanity checks (merges/forceMerge/deletions all need a commit):
    if (waitForCommit == false && waitForMerges) {
        throw new RuntimeException("pass -waitForCommit if you pass -waitForMerges");
    }
    if (waitForCommit == false && doForceMerge) {
        throw new RuntimeException("pass -waitForCommit if you pass -forceMerge");
    }
    if (waitForCommit == false && doDeletions) {
        throw new RuntimeException("pass -waitForCommit if you pass -deletions");
    }
    if (useCMS == false && disableIOThrottle) {
        throw new RuntimeException("-disableIOThrottle only makes sense with -useCMS");
    }

    final double nrtEverySec;
    if (args.hasArg("-nrtEverySec")) {
        nrtEverySec = args.getDouble("-nrtEverySec");
    } else {
        nrtEverySec = -1.0;
    }

    // True to start back at the beginning if we run out of
    // docs from the line file source:
    final boolean repeatDocs = args.getFlag("-repeatDocs");

    final String facetDVFormatName;
    if (facetFields.isEmpty()) {
        facetDVFormatName = "Lucene54";
    } else {
        facetDVFormatName = args.getString("-facetDVFormat");
    }

    if (addGroupingFields && docCountLimit == -1) {
        a.close();
        throw new RuntimeException("cannot add grouping fields unless docCount is set");
    }

    args.check();

    // Echo the effective configuration:
    System.out.println("Dir: " + dirImpl);
    System.out.println("Index path: " + dirPath);
    System.out.println("Analyzer: " + analyzer);
    System.out.println("Line file: " + lineFile);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Force merge: " + (doForceMerge ? "yes" : "no"));
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("RAM Buffer MB: " + ramBufferSizeMB);
    System.out.println("Max buffered docs: " + maxBufferedDocs);
    System.out.println("Default postings format: " + defaultPostingsFormat);
    System.out.println("Do deletions: " + (doDeletions ? "yes" : "no"));
    System.out.println("Wait for merges: " + (waitForMerges ? "yes" : "no"));
    System.out.println("Wait for commit: " + (waitForCommit ? "yes" : "no"));
    System.out.println("IO throttle: " + (disableIOThrottle ? "no" : "yes"));
    System.out.println("Merge policy: " + mergePolicy);
    System.out.println("Mode: " + mode);
    if (mode == Mode.UPDATE) {
        System.out.println("DocIDMax: " + randomDocIDMax);
    }
    System.out.println("ID field postings format: " + idFieldPostingsFormat);
    System.out.println("Add grouping fields: " + (addGroupingFields ? "yes" : "no"));
    System.out.println("Compound file format: " + (useCFS ? "yes" : "no"));
    System.out.println("Store body field: " + (storeBody ? "yes" : "no"));
    System.out.println("Term vectors for body field: " + (tvsBody ? "yes" : "no"));
    System.out.println("Facet DV Format: " + facetDVFormatName);
    System.out.println("Facet fields: " + facetFields);
    System.out.println("Body postings offsets: " + (bodyPostingsOffsets ? "yes" : "no"));
    System.out.println("Max concurrent merges: " + maxConcurrentMerges);
    System.out.println("Add DocValues fields: " + addDVFields);
    System.out.println("Use ConcurrentMergeScheduler: " + useCMS);
    if (nrtEverySec > 0.0) {
        System.out.println("Open & close NRT reader every: " + nrtEverySec + " sec");
    } else {
        System.out.println("Open & close NRT reader every: never");
    }
    System.out.println("Repeat docs: " + repeatDocs);

    if (verbose) {
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    final IndexWriterConfig iwc = new IndexWriterConfig(a);
    if (indexSortField != null) {
        iwc.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
    }
    if (mode == Mode.UPDATE) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }
    iwc.setMaxBufferedDocs(maxBufferedDocs);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);
    // So flushed segments do/don't use CFS:
    iwc.setUseCompoundFile(useCFS);
    final AtomicBoolean indexingFailed = new AtomicBoolean();
    iwc.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
    iwc.setMergePolicy(getMergePolicy(mergePolicy, useCFS));
    // Keep all commit points:
    if (doDeletions || doForceMerge) {
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    }

    // Per-field formats: "id" gets its own postings format; facet fields get the
    // chosen doc-values format; everything else falls through to the codec default.
    final Codec codec = new Lucene62Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
        }

        private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName(facetDVFormatName);
        private final DocValuesFormat directDVFormat = DocValuesFormat.forName("Direct");

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            if (facetFields.contains(field) || field.equals("$facets")) {
                return facetsDVFormat;
            } else {
                // Use default DVFormat for all else:
                return super.getDocValuesFormatForField(field);
            }
        }
    };
    iwc.setCodec(codec);

    System.out.println("IW config=" + iwc);

    IndexWriter w = new IndexWriter(dir, iwc);
    System.out.println("Index has " + w.maxDoc() + " docs");

    final TaxonomyWriter taxoWriter;
    if (facetFields.isEmpty() == false) {
        taxoWriter = new DirectoryTaxonomyWriter(od.open(Paths.get(args.getString("-indexPath"), "facets")),
                IndexWriterConfig.OpenMode.CREATE);
    } else {
        taxoWriter = null;
    }

    // Fixed seed so group field values are always consistent:
    final Random random = new Random(17);

    LineFileDocs lineFileDocs = new LineFileDocs(lineFile, repeatDocs, storeBody, tvsBody,
            bodyPostingsOffsets, false, taxoWriter, facetFields, facetsConfig, addDVFields);

    float docsPerSecPerThread = -1f;
    //float docsPerSecPerThread = 100f;
    IndexThreads threads = new IndexThreads(random, w, indexingFailed, lineFileDocs, numThreads,
            docCountLimit, addGroupingFields, printDPS, mode, docsPerSecPerThread, null, nrtEverySec,
            randomDocIDMax);

    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();
    threads.start();

    while (!threads.done() && indexingFailed.get() == false) {
        Thread.sleep(100);
        // Commits once per minute on average:
        if (doRandomCommit && random.nextInt(600) == 17) {
            System.out.println("Indexer: now commit");
            long commitStartNS = System.nanoTime();
            w.commit();
            System.out.println(String.format(Locale.ROOT, "Indexer: commit took %.1f msec",
                    (System.nanoTime() - commitStartNS) / 1000000.));
        }
    }
    threads.stop();

    final long t1 = System.currentTimeMillis();
    System.out.println("\nIndexer: indexing done (" + (t1 - t0) + " msec); total " + w.maxDoc() + " docs");
    // if we update we can not tell how many docs
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }
    if (mode != Mode.UPDATE && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }

    final Map<String, String> commitData = new HashMap<String, String>();

    if (waitForMerges) {
        // Close (blocks until merges finish) and reopen with the same config so the
        // timing below captures merge cost separately:
        w.close();
        IndexWriterConfig iwc2 = new IndexWriterConfig(a);
        iwc2.setMergeScheduler(
                getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
        iwc2.setMergePolicy(getMergePolicy(mergePolicy, useCFS));
        iwc2.setCodec(codec);
        iwc2.setUseCompoundFile(useCFS);
        iwc2.setMaxBufferedDocs(maxBufferedDocs);
        iwc2.setRAMBufferSizeMB(ramBufferSizeMB);
        if (indexSortField != null) {
            iwc2.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
        }
        w = new IndexWriter(dir, iwc2);
        long t2 = System.currentTimeMillis();
        System.out.println("\nIndexer: waitForMerges done (" + (t2 - t1) + " msec)");
    }

    if (waitForCommit) {
        commitData.put("userData", "multi");
        w.setLiveCommitData(commitData.entrySet());
        long t2 = System.currentTimeMillis();
        w.commit();
        long t3 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit multi (took " + (t3 - t2) + " msec)");
    } else {
        // Without -waitForCommit the run is throughput-only; discard the index.
        // (The flag checks above guarantee forceMerge/deletions are off here.)
        w.rollback();
        w = null;
    }

    if (doForceMerge) {
        long forceMergeStartMSec = System.currentTimeMillis();
        w.forceMerge(1);
        long forceMergeEndMSec = System.currentTimeMillis();
        System.out.println("\nIndexer: force merge done (took "
                + (forceMergeEndMSec - forceMergeStartMSec) + " msec)");
        commitData.put("userData", "single");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t5 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit single done (took " + (t5 - forceMergeEndMSec) + " msec)");
    }

    if (doDeletions) {
        final long t5 = System.currentTimeMillis();
        // Randomly delete 5% of the docs
        final Set<Integer> deleted = new HashSet<Integer>();
        final int maxDoc = w.maxDoc();
        final int toDeleteCount = (int) (maxDoc * 0.05);
        System.out.println("\nIndexer: delete " + toDeleteCount + " docs");
        while (deleted.size() < toDeleteCount) {
            final int id = random.nextInt(maxDoc);
            if (!deleted.contains(id)) {
                deleted.add(id);
                w.deleteDocuments(new Term("id", LineFileDocs.intToID(id)));
            }
        }
        final long t6 = System.currentTimeMillis();
        System.out.println("\nIndexer: deletes done (took " + (t6 - t5) + " msec)");
        commitData.put("userData", doForceMerge ? "delsingle" : "delmulti");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t7 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit delmulti done (took " + (t7 - t6) + " msec)");
        if (doUpdate || w.numDocs() != maxDoc - toDeleteCount) {
            throw new RuntimeException("count mismatch: w.numDocs()=" + w.numDocs() + " but expected "
                    + (maxDoc - toDeleteCount));
        }
    }

    if (taxoWriter != null) {
        System.out.println("Taxonomy has " + taxoWriter.getSize() + " ords");
        taxoWriter.commit();
        taxoWriter.close();
    }

    final long tCloseStart = System.currentTimeMillis();
    if (w != null) {
        w.close();
        w = null;
    }
    if (waitForCommit) {
        System.out.println("\nIndexer: at close: " + SegmentInfos.readLatestCommit(dir));
        System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) + " msec");
    }
    dir.close();
    final long tFinal = System.currentTimeMillis();

    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());

    final long indexingTime;
    if (waitForCommit) {
        indexingTime = tFinal - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec)");
    } else {
        indexingTime = t1 - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec), excluding commit");
    }
    System.out.println(
            "\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / (indexingTime / 3600000.))
                    + " GB/hour plain text");
}
From source file:perf.IndexGeoNames.java
License:Apache License
public static void main(String[] args) throws Exception { String geoNamesFile = args[0]; File indexPath = new File(args[1]); int numThreads = Integer.parseInt(args[2]); int precStep = Integer.parseInt(args[3]); if (indexPath.exists()) { throw new IllegalArgumentException("please remove indexPath \"" + indexPath + "\" before running"); }/*from www . j a v a 2 s. co m*/ Directory dir = FSDirectory.open(indexPath); //IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_48, new StandardAnalyzer(Version.LUCENE_48)); IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer()); //iwc.setRAMBufferSizeMB(350); iwc.setInfoStream(new PrintStreamInfoStream(System.out)); if (normal == false) { iwc.setRAMBufferSizeMB(1024); iwc.setMergePolicy(NoMergePolicy.INSTANCE); //iwc.setMergePolicy(NoMergePolicy.NO_COMPOUND_FILES); } else { // 5/5 segments: iwc.setMaxBufferedDocs(157234); iwc.setRAMBufferSizeMB(-1); } //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).setMaxMergesAndThreads(3, 1); final IndexWriter w = new IndexWriter(dir, iwc); final Field.Store store = Field.Store.NO; final FieldType doubleFieldType = new FieldType( store == Field.Store.NO ? DoubleField.TYPE_NOT_STORED : DoubleField.TYPE_STORED); doubleFieldType.setNumericPrecisionStep(precStep); doubleFieldType.freeze(); final FieldType longFieldType = new FieldType( store == Field.Store.NO ? LongField.TYPE_NOT_STORED : LongField.TYPE_STORED); longFieldType.setNumericPrecisionStep(precStep); longFieldType.freeze(); final FieldType intFieldType = new FieldType( store == Field.Store.NO ? 
IntField.TYPE_NOT_STORED : IntField.TYPE_STORED); intFieldType.setNumericPrecisionStep(precStep); intFieldType.freeze(); // 64K buffer: InputStream is = new FileInputStream(geoNamesFile); final BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"), 1 << 16); final AtomicInteger docsIndexed = new AtomicInteger(); final long startMS = System.currentTimeMillis(); Thread[] threads = new Thread[numThreads]; // With reuse it's ~ 38% faster (41.8 sec vs 67.0 sec): final boolean reuseDocAndFields = false; for (int i = 0; i < numThreads; i++) { threads[i] = new Thread() { @Override public void run() { ParsePosition datePos = new ParsePosition(0); SimpleDateFormat dateParser = new SimpleDateFormat("yyyy-MM-dd", Locale.US); if (reuseDocAndFields) { Document doc = new Document(); IntField geoNameID = new IntField("geoNameID", 0, intFieldType); doc.add(geoNameID); TextField nameField = new TextField("name", "", store); doc.add(nameField); TextField asciiNameField = new TextField("asciiName", "", store); doc.add(asciiNameField); TextField alternateNameField = new TextField("alternateNames", "", store); doc.add(alternateNameField); StringField featureClassField = new StringField("featureClass", "", store); doc.add(featureClassField); StringField featureCodeField = new StringField("featureCode", "", store); doc.add(featureCodeField); StringField countryCodeField = new StringField("countryCode", "", store); doc.add(countryCodeField); StringField cc2Field = new StringField("cc2", "", store); doc.add(cc2Field); StringField admin1Field = new StringField("admin1", "", store); doc.add(admin1Field); StringField admin2Field = new StringField("admin2", "", store); doc.add(admin2Field); StringField admin3Field = new StringField("admin3", "", store); doc.add(admin3Field); StringField admin4Field = new StringField("admin4", "", store); doc.add(admin4Field); StringField tzField = new StringField("timezone", "", store); doc.add(tzField); while (true) { try { // Curiously 
BufferedReader.readLine seems to be thread-safe... String line = reader.readLine(); if (line == null) { break; } String[] values = line.split("\t"); geoNameID.setIntValue(Integer.parseInt(values[0])); nameField.setStringValue(values[1]); asciiNameField.setStringValue(values[2]); alternateNameField.setStringValue(values[3]); /* if (values[4].isEmpty() == false) { double v = Double.parseDouble(values[4]); doc.add(new DoubleField("latitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("latitude", v)); } if (values[5].isEmpty() == false) { double v = Double.parseDouble(values[5]); doc.add(new DoubleField("longitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("longitude", v)); } */ featureClassField.setStringValue(values[6]); featureCodeField.setStringValue(values[7]); countryCodeField.setStringValue(values[8]); cc2Field.setStringValue(values[9]); admin1Field.setStringValue(values[10]); admin2Field.setStringValue(values[11]); admin3Field.setStringValue(values[12]); admin4Field.setStringValue(values[13]); /* if (values[14].isEmpty() == false) { long v = Long.parseLong(values[14]); doc.add(new LongField("population", v, longFieldType)); doc.add(new NumericDocValuesField("population", v)); } if (values[15].isEmpty() == false) { long v = Long.parseLong(values[15]); doc.add(new LongField("elevation", v, longFieldType)); doc.add(new NumericDocValuesField("elevation", v)); } if (values[16].isEmpty() == false) { doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType)); } */ tzField.setStringValue(values[17]); /* if (values[18].isEmpty() == false) { datePos.setIndex(0); Date date = dateParser.parse(values[18], datePos); doc.add(new LongField("modified", date.getTime(), longFieldType)); } */ w.addDocument(doc); int count = docsIndexed.incrementAndGet(); if (count % 200000 == 0) { long ms = System.currentTimeMillis(); System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec"); } } catch (Exception e) { throw new 
RuntimeException(e); } } } else { while (true) { try { // Curiously BufferedReader.readLine seems to be thread-safe... String line = reader.readLine(); if (line == null) { break; } String[] values = line.split("\t"); Document doc = new Document(); doc.add(new IntField("geoNameID", Integer.parseInt(values[0]), intFieldType)); doc.add(new TextField("name", values[1], store)); doc.add(new TextField("asciiName", values[2], store)); doc.add(new TextField("alternateNames", values[3], store)); if (values[4].isEmpty() == false) { double v = Double.parseDouble(values[4]); doc.add(new DoubleField("latitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("latitude", v)); } if (values[5].isEmpty() == false) { double v = Double.parseDouble(values[5]); doc.add(new DoubleField("longitude", v, doubleFieldType)); doc.add(new DoubleDocValuesField("longitude", v)); } doc.add(new StringField("featureClass", values[6], store)); doc.add(new StringField("featureCode", values[7], store)); doc.add(new StringField("countryCode", values[8], store)); doc.add(new StringField("cc2", values[9], store)); doc.add(new StringField("admin1Code", values[10], store)); doc.add(new StringField("admin2Code", values[11], store)); doc.add(new StringField("admin3Code", values[12], store)); doc.add(new StringField("admin4Code", values[13], store)); if (values[14].isEmpty() == false) { long v = Long.parseLong(values[14]); doc.add(new LongField("population", v, longFieldType)); doc.add(new NumericDocValuesField("population", v)); } if (values[15].isEmpty() == false) { long v = Long.parseLong(values[15]); doc.add(new LongField("elevation", v, longFieldType)); doc.add(new NumericDocValuesField("elevation", v)); } if (values[16].isEmpty() == false) { doc.add(new IntField("dem", Integer.parseInt(values[16]), intFieldType)); } doc.add(new StringField("timezone", values[17], store)); if (values[18].isEmpty() == false) { datePos.setIndex(0); Date date = dateParser.parse(values[18], datePos); doc.add(new 
LongField("modified", date.getTime(), longFieldType)); } w.addDocument(doc); int count = docsIndexed.incrementAndGet(); if (count % 200000 == 0) { long ms = System.currentTimeMillis(); System.out.println(count + ": " + ((ms - startMS) / 1000.0) + " sec"); } } catch (Exception e) { throw new RuntimeException(e); } } } } }; threads[i].start(); } DirectoryReader r = DirectoryReader.open(w, true); for (int i = 0; i < 100; i++) { DirectoryReader r2 = DirectoryReader.openIfChanged(r); if (r2 != null) { r.close(); r = r2; } Thread.sleep(500); } if (r != null) { r.close(); r = null; } for (int i = 0; i < numThreads; i++) { threads[i].join(); } long ms = System.currentTimeMillis(); System.out.println(docsIndexed + ": " + ((ms - startMS) / 1000.0) + " sec"); //System.out.println("tot conflicts: " + BytesRefHash.totConflict); //w.shutdown(normal); w.close(); dir.close(); }
From source file:perf.IndexGeoNames2.java
License:Apache License
public static void main(String args[]) throws Exception { String geoNamesFile = args[0]; File indexPath = new File(args[1]); Directory dir = FSDirectory.open(indexPath); IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(null).setRAMBufferSizeMB(50) //.setRAMBufferSizeMB(1) .setOpenMode(IndexWriterConfig.OpenMode.CREATE).setMergePolicy(NoMergePolicy.INSTANCE) .setInfoStream(new PrintStreamInfoStream(System.out))); FileInputStream fs = new FileInputStream(geoNamesFile); BufferedReader r = new BufferedReader(new InputStreamReader(fs, "UTF-8")); String line = null;// w w w . j a va 2s . c om Document doc = new Document(); Field fields[] = new Field[19]; for (int i = 0; i < fields.length; i++) { fields[i] = new StringField("" + i, "", Field.Store.NO); doc.add(fields[i]); } int docCount = 0; long prev = System.currentTimeMillis(); while ((line = r.readLine()) != null) { if ((++docCount % 10000) == 0) { long curr = System.currentTimeMillis(); System.out.println("Indexed: " + docCount + " (" + (curr - prev) + ")"); prev = curr; } String parts[] = line.split("\t"); for (int i = 0; i < fields.length; i++) { fields[i].setStringValue(parts[i]); } iw.addDocument(doc); } r.close(); iw.close(); dir.close(); }
From source file:perf.SearchPerfTest.java
License:Apache License
private static void _main(String[] clArgs) throws Exception { // args: dirImpl indexPath numThread numIterPerThread // eg java SearchPerfTest /path/to/index 4 100 final Args args = new Args(clArgs); Directory dir0;/* ww w. j a v a 2s . c o m*/ final String dirPath = args.getString("-indexPath") + "/index"; final String dirImpl = args.getString("-dirImpl"); OpenDirectory od = OpenDirectory.get(dirImpl); /* } else if (dirImpl.equals("NativePosixMMapDirectory")) { dir0 = new NativePosixMMapDirectory(new File(dirPath)); ramDir = null; if (doFacets) { facetsDir = new NativePosixMMapDirectory(new File(facetsDirPath)); } } else if (dirImpl.equals("CachingDirWrapper")) { dir0 = new CachingRAMDirectory(new MMapDirectory(new File(dirPath))); ramDir = null; } else if (dirImpl.equals("RAMExceptDirectPostingsDirectory")) { // Load only non-postings files into RAMDir (assumes // Lucene40PF is the wrapped PF): Set<String> postingsExtensions = new HashSet<String>(); postingsExtensions.add("frq"); postingsExtensions.add("prx"); postingsExtensions.add("tip"); postingsExtensions.add("tim"); ramDir = new RAMDirectory(); Directory fsDir = new MMapDirectory(new File(dirPath)); for (String file : fsDir.listAll()) { int idx = file.indexOf('.'); if (idx != -1 && postingsExtensions.contains(file.substring(idx+1, file.length()))) { continue; } fsDir.copy(ramDir, file, file, IOContext.READ); } dir0 = new FileSwitchDirectory(postingsExtensions, fsDir, ramDir, true); if (doFacets) { facetsDir = new RAMDirectory(new SimpleFSDirectory(new File(facetsDirPath)), IOContext.READ); } */ final RAMDirectory ramDir; dir0 = od.open(Paths.get(dirPath)); if (dir0 instanceof RAMDirectory) { ramDir = (RAMDirectory) dir0; } else { ramDir = null; } // TODO: NativeUnixDir? 
final String analyzer = args.getString("-analyzer"); final String tasksFile = args.getString("-taskSource"); final int searchThreadCount = args.getInt("-searchThreadCount"); final String fieldName = args.getString("-field"); final boolean printHeap = args.getFlag("-printHeap"); final boolean doPKLookup = args.getFlag("-pk"); final int topN = args.getInt("-topN"); final boolean doStoredLoads = args.getFlag("-loadStoredFields"); // Used to choose which random subset of tasks we will // run, to generate the PKLookup tasks, and to generate // any random pct filters: final long staticRandomSeed = args.getLong("-staticSeed"); // Used to shuffle the random subset of tasks: final long randomSeed = args.getLong("-seed"); // TODO: this could be way better. final String similarity = args.getString("-similarity"); // now reflect final Class<? extends Similarity> simClazz = Class .forName("org.apache.lucene.search.similarities." + similarity).asSubclass(Similarity.class); final Similarity sim = simClazz.newInstance(); System.out.println("Using dir impl " + dir0.getClass().getName()); System.out.println("Analyzer " + analyzer); System.out.println("Similarity " + similarity); System.out.println("Search thread count " + searchThreadCount); System.out.println("topN " + topN); System.out.println("JVM " + (Constants.JRE_IS_64BIT ? 
"is" : "is not") + " 64bit"); System.out.println("Pointer is " + RamUsageEstimator.NUM_BYTES_OBJECT_REF + " bytes"); final Analyzer a; if (analyzer.equals("EnglishAnalyzer")) { a = new EnglishAnalyzer(); } else if (analyzer.equals("ClassicAnalyzer")) { a = new ClassicAnalyzer(); } else if (analyzer.equals("StandardAnalyzer")) { a = new StandardAnalyzer(); } else if (analyzer.equals("StandardAnalyzerNoStopWords")) { a = new StandardAnalyzer(CharArraySet.EMPTY_SET); } else if (analyzer.equals("ShingleStandardAnalyzer")) { a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, true, true, ShingleFilter.DEFAULT_FILLER_TOKEN); } else { throw new RuntimeException("unknown analyzer " + analyzer); } final ReferenceManager<IndexSearcher> mgr; final IndexWriter writer; final Directory dir; final String commit = args.getString("-commit"); final String hiliteImpl = args.getString("-hiliteImpl"); final String logFile = args.getString("-log"); final long tSearcherStart = System.currentTimeMillis(); final boolean verifyCheckSum = !args.getFlag("-skipVerifyChecksum"); final boolean recacheFilterDeletes = args.getFlag("-recacheFilterDeletes"); if (recacheFilterDeletes) { throw new UnsupportedOperationException("recacheFilterDeletes was deprecated"); } if (args.getFlag("-nrt")) { // TODO: get taxoReader working here too // TODO: factor out & share this CL processing w/ Indexer final int indexThreadCount = args.getInt("-indexThreadCount"); final String lineDocsFile = args.getString("-lineDocsFile"); final float docsPerSecPerThread = args.getFloat("-docsPerSecPerThread"); final float reopenEverySec = args.getFloat("-reopenEverySec"); final boolean storeBody = args.getFlag("-store"); final boolean tvsBody = args.getFlag("-tvs"); final boolean useCFS = args.getFlag("-cfs"); final String defaultPostingsFormat = args.getString("-postingsFormat"); final String idFieldPostingsFormat = 
args.getString("-idFieldPostingsFormat"); final boolean verbose = args.getFlag("-verbose"); final boolean cloneDocs = args.getFlag("-cloneDocs"); final Mode mode = Mode.valueOf(args.getString("-mode", "update").toUpperCase(Locale.ROOT)); final long reopenEveryMS = (long) (1000 * reopenEverySec); if (verbose) { InfoStream.setDefault(new PrintStreamInfoStream(System.out)); } if (!dirImpl.equals("RAMDirectory") && !dirImpl.equals("RAMExceptDirectPostingsDirectory")) { System.out.println("Wrap NRTCachingDirectory"); dir0 = new NRTCachingDirectory(dir0, 20, 400.0); } dir = dir0; final IndexWriterConfig iwc = new IndexWriterConfig(a); iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND); iwc.setRAMBufferSizeMB(256.0); iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE); // TODO: also RAMDirExceptDirect...? need to // ... block deletes against wrapped FSDir? if (dirImpl.equals("RAMDirectory")) { // Let IW remove files only referenced by starting commit: iwc.setIndexDeletionPolicy(new KeepNoCommitsDeletionPolicy()); } if (commit != null && commit.length() > 0) { System.out.println("Opening writer on commit=" + commit); iwc.setIndexCommit(PerfUtils.findCommitPoint(commit, dir)); } ((TieredMergePolicy) iwc.getMergePolicy()).setNoCFSRatio(useCFS ? 1.0 : 0.0); //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(1024); //((TieredMergePolicy) iwc.getMergePolicy()).setReclaimDeletesWeight(3.0); //((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergeAtOnce(4); final Codec codec = new Lucene62Codec() { @Override public PostingsFormat getPostingsFormatForField(String field) { return PostingsFormat .forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat); } }; iwc.setCodec(codec); final ConcurrentMergeScheduler cms = (ConcurrentMergeScheduler) iwc.getMergeScheduler(); // Only let one merge run at a time... // ... 
but queue up up to 4, before index thread is stalled: cms.setMaxMergesAndThreads(4, 1); iwc.setMergedSegmentWarmer(new IndexWriter.IndexReaderWarmer() { @Override public void warm(LeafReader reader) throws IOException { final long t0 = System.currentTimeMillis(); //System.out.println("DO WARM: " + reader); IndexSearcher s = new IndexSearcher(reader); s.setQueryCache(null); // don't bench the cache s.search(new TermQuery(new Term(fieldName, "united")), 10); final long t1 = System.currentTimeMillis(); System.out.println("warm segment=" + reader + " numDocs=" + reader.numDocs() + ": took " + (t1 - t0) + " msec"); } }); writer = new IndexWriter(dir, iwc); System.out.println("Initial writer.maxDoc()=" + writer.maxDoc()); // TODO: add -nrtBodyPostingsOffsets instead of // hardwired false: boolean addDVFields = mode == Mode.BDV_UPDATE || mode == Mode.NDV_UPDATE; LineFileDocs lineFileDocs = new LineFileDocs(lineDocsFile, false, storeBody, tvsBody, false, cloneDocs, null, null, null, addDVFields); IndexThreads threads = new IndexThreads(new Random(17), writer, new AtomicBoolean(false), lineFileDocs, indexThreadCount, -1, false, false, mode, docsPerSecPerThread, null, -1.0, -1); threads.start(); mgr = new SearcherManager(writer, new SearcherFactory() { @Override public IndexSearcher newSearcher(IndexReader reader, IndexReader previous) { IndexSearcher s = new IndexSearcher(reader); s.setQueryCache(null); // don't bench the cache s.setSimilarity(sim); return s; } }); System.out.println("reopen every " + reopenEverySec); Thread reopenThread = new Thread() { @Override public void run() { try { final long startMS = System.currentTimeMillis(); int reopenCount = 1; while (true) { final long sleepMS = startMS + (reopenCount * reopenEveryMS) - System.currentTimeMillis(); if (sleepMS < 0) { System.out.println("WARNING: reopen fell behind by " + Math.abs(sleepMS) + " ms"); } else { Thread.sleep(sleepMS); } Thread.sleep(sleepMS); mgr.maybeRefresh(); reopenCount++; IndexSearcher s = 
mgr.acquire(); try { if (ramDir != null) { System.out.println(String.format(Locale.ENGLISH, "%.1fs: index: %d bytes in RAMDir; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d", (System.currentTimeMillis() - startMS) / 1000.0, ramDir.ramBytesUsed(), writer.maxDoc(), s.getIndexReader().maxDoc(), s.getIndexReader().numDocs())); //String[] l = ramDir.listAll(); //Arrays.sort(l); //for(String f : l) { //System.out.println(" " + f + ": " + ramDir.fileLength(f)); //} } else { System.out.println(String.format(Locale.ENGLISH, "%.1fs: done reopen; writer.maxDoc()=%d; searcher.maxDoc()=%d; searcher.numDocs()=%d", (System.currentTimeMillis() - startMS) / 1000.0, writer.maxDoc(), s.getIndexReader().maxDoc(), s.getIndexReader().numDocs())); } } finally { mgr.release(s); } } } catch (Exception e) { throw new RuntimeException(e); } } }; reopenThread.setName("ReopenThread"); reopenThread.setPriority(4 + Thread.currentThread().getPriority()); reopenThread.start(); } else { dir = dir0; writer = null; final DirectoryReader reader; if (commit != null && commit.length() > 0) { System.out.println("Opening searcher on commit=" + commit); reader = DirectoryReader.open(PerfUtils.findCommitPoint(commit, dir)); } else { // open last commit reader = DirectoryReader.open(dir); } IndexSearcher s = new IndexSearcher(reader); s.setQueryCache(null); // don't bench the cache s.setSimilarity(sim); System.out.println("maxDoc=" + reader.maxDoc() + " numDocs=" + reader.numDocs() + " %tg deletes=" + (100. 
* reader.maxDoc() / reader.numDocs())); mgr = new SingleIndexSearcher(s); } System.out.println((System.currentTimeMillis() - tSearcherStart) + " msec to init searcher/NRT"); { IndexSearcher s = mgr.acquire(); try { System.out.println("Searcher: numDocs=" + s.getIndexReader().numDocs() + " maxDoc=" + s.getIndexReader().maxDoc() + ": " + s); } finally { mgr.release(s); } } //System.out.println("searcher=" + searcher); FacetsConfig facetsConfig = new FacetsConfig(); facetsConfig.setHierarchical("Date", true); TaxonomyReader taxoReader; Path taxoPath = Paths.get(args.getString("-indexPath"), "facets"); Directory taxoDir = od.open(taxoPath); if (DirectoryReader.indexExists(taxoDir)) { taxoReader = new DirectoryTaxonomyReader(taxoDir); System.out.println("Taxonomy has " + taxoReader.getSize() + " ords"); } else { taxoReader = null; } final Random staticRandom = new Random(staticRandomSeed); final Random random = new Random(randomSeed); final DirectSpellChecker spellChecker = new DirectSpellChecker(); final IndexState indexState = new IndexState(mgr, taxoReader, fieldName, spellChecker, hiliteImpl, facetsConfig); final QueryParser queryParser = new QueryParser("body", a); TaskParser taskParser = new TaskParser(indexState, queryParser, fieldName, topN, staticRandom, doStoredLoads); final TaskSource tasks; if (tasksFile.startsWith("server:")) { int idx = tasksFile.indexOf(':', 8); if (idx == -1) { throw new RuntimeException( "server is missing the port; should be server:interface:port (got: " + tasksFile + ")"); } String iface = tasksFile.substring(7, idx); int port = Integer.valueOf(tasksFile.substring(1 + idx)); RemoteTaskSource remoteTasks = new RemoteTaskSource(iface, port, searchThreadCount, taskParser); // nocommit must stop thread? 
tasks = remoteTasks; } else { // Load the tasks from a file: final int taskRepeatCount = args.getInt("-taskRepeatCount"); final int numTaskPerCat = args.getInt("-tasksPerCat"); tasks = new LocalTaskSource(indexState, taskParser, tasksFile, staticRandom, random, numTaskPerCat, taskRepeatCount, doPKLookup); System.out.println("Task repeat count " + taskRepeatCount); System.out.println("Tasks file " + tasksFile); System.out.println("Num task per cat " + numTaskPerCat); } args.check(); // Evil respeller: //spellChecker.setMinPrefix(0); //spellChecker.setMaxInspections(1024); final TaskThreads taskThreads = new TaskThreads(tasks, indexState, searchThreadCount); Thread.sleep(10); final long startNanos = System.nanoTime(); taskThreads.start(); taskThreads.finish(); final long endNanos = System.nanoTime(); System.out.println("\n" + ((endNanos - startNanos) / 1000000.0) + " msec total"); final List<Task> allTasks = tasks.getAllTasks(); PrintStream out = new PrintStream(logFile); if (allTasks != null) { // Tasks were local: verify checksums: // indexState.setDocIDToID(); final Map<Task, Task> tasksSeen = new HashMap<Task, Task>(); out.println("\nResults for " + allTasks.size() + " tasks:"); boolean fail = false; for (final Task task : allTasks) { if (verifyCheckSum) { final Task other = tasksSeen.get(task); if (other != null) { if (task.checksum() != other.checksum()) { System.out.println("\nTASK:"); task.printResults(System.out, indexState); System.out.println("\nOTHER TASK:"); other.printResults(System.out, indexState); fail = true; //throw new RuntimeException("task " + task + " hit different checksums: " + task.checksum() + " vs " + other.checksum() + " other=" + other); } } else { tasksSeen.put(task, task); } } out.println("\nTASK: " + task); out.println(" " + (task.runTimeNanos / 1000000.0) + " msec"); out.println(" thread " + task.threadID); task.printResults(out, indexState); } if (fail) { throw new RuntimeException("some tasks got different results across different 
threads"); } allTasks.clear(); } mgr.close(); if (taxoReader != null) { taxoReader.close(); } if (writer != null) { // Don't actually commit any index changes: writer.rollback(); } dir.close(); if (printHeap) { // Try to get RAM usage -- some ideas poached from http://www.javaworld.com/javaworld/javatips/jw-javatip130.html final Runtime runtime = Runtime.getRuntime(); long usedMem1 = PerfUtils.usedMemory(runtime); long usedMem2 = Long.MAX_VALUE; for (int iter = 0; iter < 10; iter++) { runtime.runFinalization(); runtime.gc(); Thread.yield(); Thread.sleep(100); usedMem2 = usedMem1; usedMem1 = PerfUtils.usedMemory(runtime); } out.println("\nHEAP: " + PerfUtils.usedMemory(runtime)); } out.close(); }