List of usage examples for org.apache.lucene.index.IndexWriter.forceMerge
public void forceMerge(int maxNumSegments) throws IOException
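Before the project-specific examples below, a minimal self-contained sketch of the call, assuming the modern path-based Lucene API (5.x or later); the index path here is hypothetical. forceMerge(1) blocks until the index is merged down to a single segment, and can transiently need two to three times the index size in free disk space:

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class ForceMergeExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical index path; adjust to your environment.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
             IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // Merge the whole index down to a single segment.
            writer.forceMerge(1);
            writer.commit();
        }
    }
}

Passing a larger maxNumSegments merges down to at most that many segments, which is proportionally cheaper than a full merge to one.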
From source file:org.opensolaris.opengrok.index.IndexDatabase.java
License:Open Source License
/**
 * Optimize the index database.
 */
public void optimize() {
    synchronized (lock) {
        if (running) {
            log.warning("Optimize terminated... Someone else is updating / optimizing it!");
            return;
        }
        running = true;
    }
    IndexWriter wrt = null;
    try {
        log.info("Optimizing the index ... ");
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        wrt = new IndexWriter(indexDirectory, conf);
        wrt.forceMerge(1); // replaces the deprecated optimize() call
        log.info("done");
        synchronized (lock) {
            if (dirtyFile.exists() && !dirtyFile.delete()) {
                log.log(Level.FINE, "Failed to remove \"dirty-file\": {0}", dirtyFile.getAbsolutePath());
            }
            dirty = false;
        }
    } catch (IOException e) {
        log.log(Level.SEVERE, "ERROR: optimizing index: {0}", e);
    } finally {
        if (wrt != null) {
            try {
                wrt.close();
            } catch (IOException e) {
                log.log(Level.WARNING, "An error occurred while closing writer", e);
            }
        }
        synchronized (lock) {
            running = false;
        }
    }
}
From source file:org.punksearch.crawler.IndexOperator.java
License:Open Source License
public static void optimize(String dir) {
    IndexWriter iw;
    try {
        iw = createIndexWriter(dir);
        iw.forceMerge(5);
        iw.close();
    } catch (IOException e) {
        log.error("Exception during optimizing index directory '" + dir + "': " + e.getMessage());
    }
}
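A hedged variant of the same idea: merge only when the index is actually fragmented. This sketch checks the committed segment count via SegmentInfos.readLatestCommit; the threshold of 5 mirrors the punksearch example above, and the class and method names are illustrative only:

import java.io.IOException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.SegmentInfos;
import org.apache.lucene.store.Directory;

public class ConditionalMerge {
    // Only pay for the merge when the committed index has grown past
    // the target segment count.
    public static void mergeIfFragmented(Directory dir, IndexWriter writer) throws IOException {
        int segments = SegmentInfos.readLatestCommit(dir).size();
        if (segments > 5) {
            writer.forceMerge(5); // merge down to at most five segments
            writer.commit();
        }
    }
}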
From source file:org.scify.NewSumServer.Server.Searching.Indexer.java
License:Apache License
/**
 * The main method of the Indexer class. Traverses a directory and creates
 * the index files needed for the package to operate.
 * @throws CorruptIndexException
 * @throws LockObtainFailedException
 * @throws IOException
 */
public void createIndex() throws CorruptIndexException, LockObtainFailedException, IOException {
    // The dir containing the files to index
    File docDir = new File(this.sFilesPath);
    Directory FSDir = FSDirectory.open(indexDir);
    // Init the analyzer, according to locale
    if (lLoc.toString().equals("el")) {
        anal = new GreekAnalyzer(Version.LUCENE_36);
    } else if (lLoc.toString().equals("en")) {
        // The standard analyzer
        Analyzer stdAnal = new StandardAnalyzer(Version.LUCENE_36);
        // In order to index all the text in a field,
        // however long that field may be
        anal = new LimitTokenCountAnalyzer(stdAnal, Integer.MAX_VALUE);
    }
    // The configuration for the index writer
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, anal);
    conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    // The index writer
    IndexWriter indexWriter = new IndexWriter(FSDir, conf);
    // For each file in the dir, create a Document
    for (File file : getFilesFromFirstLeverSubdirs(docDir)) {
        String filename = file.getName();
        String fullFileName = file.getAbsolutePath();
        String tmpText = Utilities.readFromFile(fullFileName, " ");
        Document d = new Document(); // Lucene Document
        // Add the "filename" field
        d.add(new Field(FILE_FIELD, filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
        // Add the "text" field
        d.add(new Field(TEXT_FIELD, tmpText, Field.Store.YES, Field.Index.ANALYZED));
        // Add the document to the writer
        indexWriter.addDocument(d);
    }
    int numDocs = indexWriter.numDocs();
    // The index will be merged down into a single segment, resulting in
    // a smaller index with better search performance. Costly operation;
    // DO NOT USE on large dirs or when disk space is low (needs (2-3)*DirSize)
    indexWriter.forceMerge(1);
    // Syncs all referenced index files.
    // At this point old indexes will be deleted, freeing up space
    indexWriter.commit();
    // Terminate the writer appropriately
    indexWriter.close();
    // LOGGER.log(Level.INFO, "Successfully closed indexWriter with {0}", anal.toString());
}
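The comment in the example above warns that forceMerge(1) needs roughly (2-3)*DirSize of free disk space. A small illustrative guard for that precondition (plain java.io, all names hypothetical) might look like:

import java.io.File;

public class MergeSpaceCheck {
    // Returns true when the filesystem holding the index has at least
    // three times the index's current size available, the worst-case
    // transient footprint of a full forceMerge(1).
    public static boolean enoughSpaceToMerge(File indexDir) {
        long indexSize = 0;
        File[] files = indexDir.listFiles();
        if (files == null) {
            return false; // not a directory, or not readable
        }
        for (File f : files) {
            indexSize += f.length();
        }
        return indexDir.getUsableSpace() > 3 * indexSize;
    }
}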
From source file:org.voyanttools.trombone.input.index.LuceneIndexer.java
License:Open Source License
private void indexStream(Collection<StoredDocumentSource> storedDocumentSourceForLucene, String corpusId)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    // index documents (or at least add corpus to document if not already there); we need to get a new writer
    IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
    DirectoryReader indexReader = DirectoryReader.open(indexWriter);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    boolean verbose = parameters.getParameterBooleanValue("verbose");
    try {
        storedDocumentSourceForLucene.parallelStream().forEach(storedDocumentSource -> {
            Runnable runnable;
            try {
                runnable = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                        storedDocumentSource, corpusId, verbose);
                runnable.run();
            } catch (IOException e) {
                // TODO Auto-generated catch block
                e.printStackTrace();
            }
        });
    } catch (Exception e) {
        e.printStackTrace();
    }
    if (parameters.containsKey("forceMerge")) {
        indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
    }
    indexReader = DirectoryReader.open(indexWriter);
    storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards
    // now determine which documents need to be analyzed
    Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
            storedDocumentSourceForAnalysis.add(storedDocumentSource);
        }
    }
    if (storedDocumentSourceForAnalysis.isEmpty() == false) {
        IndexSearcher indexSearcher2 = new IndexSearcher(indexReader);
        try {
            storedDocumentSourceForAnalysis.parallelStream().forEach(storedDocumentSource -> {
                if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                    Runnable worker;
                    try {
                        worker = new IndexedDocumentAnalyzer(storage, indexSearcher2, storedDocumentSource,
                                corpusId, verbose);
                        worker.run();
                    } catch (IOException e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                    }
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
From source file:org.voyanttools.trombone.input.index.LuceneIndexer.java
License:Open Source License
private void indexExecutorService(Collection<StoredDocumentSource> storedDocumentSourceForLucene, String corpusId)
        throws CorruptIndexException, LockObtainFailedException, IOException {
    // index documents (or at least add corpus to document if not already there); we need to get a new writer
    IndexWriter indexWriter = storage.getLuceneManager().getIndexWriter(corpusId);
    DirectoryReader indexReader = DirectoryReader.open(indexWriter);
    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
    boolean verbose = parameters.getParameterBooleanValue("verbose");
    int processors = Runtime.getRuntime().availableProcessors();
    ExecutorService executor;
    // index
    executor = Executors.newFixedThreadPool(processors);
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        Runnable worker = new StoredDocumentSourceIndexer(storage, indexWriter, indexSearcher,
                storedDocumentSource, corpusId, verbose);
        try {
            executor.execute(worker);
        } catch (Exception e) {
            executor.shutdown();
            throw e;
        }
    }
    executor.shutdown();
    try {
        if (!executor.awaitTermination(parameters.getParameterIntValue("luceneIndexingTimeout", 60 * 10),
                TimeUnit.SECONDS)) { // default 10 minutes
            executor.shutdownNow();
            throw new InterruptedException("Lucene indexing has run out of time.");
        }
    } catch (InterruptedException e) {
        executor.shutdownNow();
        Thread.currentThread().interrupt();
        throw new RuntimeException("Lucene indexing has been interrupted.", e);
    } finally {
        try {
            indexWriter.commit();
        } catch (IOException e) {
            indexWriter.close(); // this may also throw an exception, but docs say to close on commit error
            throw e;
        }
    }
    if (parameters.containsKey("forceMerge")) {
        indexWriter.forceMerge(parameters.getParameterIntValue("forceMerge"));
    }
    indexReader = DirectoryReader.open(indexWriter);
    storage.getLuceneManager().setDirectoryReader(corpusId, indexReader); // make sure it's available afterwards
    // now determine which documents need to be analyzed
    Collection<StoredDocumentSource> storedDocumentSourceForAnalysis = new ArrayList<StoredDocumentSource>();
    for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForLucene) {
        if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
            storedDocumentSourceForAnalysis.add(storedDocumentSource);
        }
    }
    if (storedDocumentSourceForAnalysis.isEmpty() == false) {
        indexSearcher = new IndexSearcher(indexReader);
        executor = Executors.newFixedThreadPool(processors);
        for (StoredDocumentSource storedDocumentSource : storedDocumentSourceForAnalysis) {
            if (storedDocumentSource.getMetadata().getLastTokenPositionIndex(TokenType.lexical) == 0) { // don't re-analyze
                Runnable worker = new IndexedDocumentAnalyzer(storage, indexSearcher, storedDocumentSource,
                        corpusId, verbose);
                executor.execute(worker);
            }
        }
        executor.shutdown();
        try {
            if (!executor.awaitTermination(parameters.getParameterIntValue("luceneAnalysisTimeout", 60 * 10),
                    TimeUnit.SECONDS)) { // default 10 minutes
                throw new InterruptedException("Lucene analysis has run out of time.");
            }
        } catch (InterruptedException e) {
            throw new RuntimeException("Lucene document analysis ran out of time", e);
        }
    }
}
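Both Voyant examples above block on forceMerge. IndexWriter also offers a two-argument overload, forceMerge(int maxNumSegments, boolean doWait). A sketch of the non-blocking form (hypothetical index path; background merges handled by the default ConcurrentMergeScheduler):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;

public class BackgroundForceMerge {
    public static void main(String[] args) throws Exception {
        IndexWriter writer = new IndexWriter(
                FSDirectory.open(Paths.get("/tmp/example-index")), // hypothetical path
                new IndexWriterConfig(new StandardAnalyzer()));
        // doWait=false hands the merge to the scheduler's background
        // threads and returns immediately, so this thread can keep working.
        writer.forceMerge(1, false);
        // close() still commits and waits for in-flight merges by default.
        writer.close();
    }
}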
From source file:perf.IndexAndSearchOpenStreetMaps.java
License:Apache License
private static void createIndex(boolean fast, boolean doForceMerge, boolean doDistanceSort)
        throws IOException, InterruptedException {
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder()
            .onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);
    int BUFFER_SIZE = 1 << 16; // 64K
    InputStream is;
    if (SMALL) {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.subsetPlusAllLondon.txt"));
    } else {
        is = Files.newInputStream(Paths.get(DATA_LOCATION, "latlon.txt"));
    }
    BufferedReader reader = new BufferedReader(new InputStreamReader(is, decoder), BUFFER_SIZE);
    int NUM_THREADS;
    if (fast) {
        NUM_THREADS = 4;
    } else {
        NUM_THREADS = 1;
    }
    int CHUNK = 10000;
    long t0 = System.nanoTime();
    AtomicLong totalCount = new AtomicLong();
    for (int part = 0; part < NUM_PARTS; part++) {
        Directory dir = FSDirectory.open(Paths.get(getName(part, doDistanceSort)));
        IndexWriterConfig iwc = new IndexWriterConfig(null);
        iwc.setCodec(getCodec(fast));
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        if (fast) {
            ((TieredMergePolicy) iwc.getMergePolicy()).setMaxMergedSegmentMB(Double.POSITIVE_INFINITY);
            iwc.setRAMBufferSizeMB(1024);
        } else {
            iwc.setMaxBufferedDocs(109630);
            iwc.setMergePolicy(new LogDocMergePolicy());
            iwc.setMergeScheduler(new SerialMergeScheduler());
        }
        iwc.setInfoStream(new PrintStreamInfoStream(System.out));
        IndexWriter w = new IndexWriter(dir, iwc);
        Thread[] threads = new Thread[NUM_THREADS];
        AtomicBoolean finished = new AtomicBoolean();
        Object lock = new Object();
        final int finalPart = part;
        for (int t = 0; t < NUM_THREADS; t++) {
            threads[t] = new Thread() {
                @Override
                public void run() {
                    String[] lines = new String[CHUNK];
                    int chunkCount = 0;
                    while (finished.get() == false) {
                        try {
                            int count = CHUNK;
                            synchronized (lock) {
                                for (int i = 0; i < CHUNK; i++) {
                                    String line = reader.readLine();
                                    if (line == null) {
                                        count = i;
                                        finished.set(true);
                                        break;
                                    }
                                    lines[i] = line;
                                }
                                if (finalPart == 0 && totalCount.get() + count >= 2000000000) {
                                    finished.set(true);
                                }
                            }
                            for (int i = 0; i < count; i++) {
                                String[] parts = lines[i].split(",");
                                //long id = Long.parseLong(parts[0]);
                                double lat = Double.parseDouble(parts[1]);
                                double lon = Double.parseDouble(parts[2]);
                                Document doc = new Document();
                                if (useGeoPoint) {
                                    doc.add(new GeoPointField("point", lat, lon, Field.Store.NO));
                                } else if (useGeo3D || useGeo3DLarge) {
                                    doc.add(new Geo3DPoint("point", lat, lon));
                                } else {
                                    doc.add(new LatLonPoint("point", lat, lon));
                                    if (doDistanceSort) {
                                        doc.add(new LatLonDocValuesField("point", lat, lon));
                                    }
                                }
                                w.addDocument(doc);
                                long x = totalCount.incrementAndGet();
                                if (x % 1000000 == 0) {
                                    System.out.println(x + "...");
                                }
                            }
                            chunkCount++;
                            if (false && SMALL == false && chunkCount == 20000) {
                                System.out.println("NOW BREAK EARLY");
                                break;
                            }
                        } catch (IOException ioe) {
                            throw new RuntimeException(ioe);
                        }
                    }
                }
            };
            threads[t].start();
        }
        for (Thread thread : threads) {
            thread.join();
        }
        System.out.println("Part " + part + " is done: w.maxDoc()=" + w.maxDoc());
        w.commit();
        System.out.println("done commit");
        long t1 = System.nanoTime();
        System.out.println(((t1 - t0) / 1000000000.0) + " sec to index part " + part);
        if (doForceMerge) {
            w.forceMerge(1);
            long t2 = System.nanoTime();
            System.out.println(((t2 - t1) / 1000000000.0) + " sec to force merge part " + part);
        }
        w.close();
    }
    //System.out.println(totalCount.get() + " total docs");
    //System.out.println("Force merge...");
    //w.forceMerge(1);
    //long t2 = System.nanoTime();
    //System.out.println(((t2-t1)/1000000000.0) + " sec to force merge");
    //w.close();
    //long t3 = System.nanoTime();
    //System.out.println(((t3-t2)/1000000000.0) + " sec to close");
}
From source file:perf.Indexer.java
License:Apache License
private static void _main(String[] clArgs) throws Exception {
    Args args = new Args(clArgs);

    // EG: -facets Date -facets characterCount ...
    FacetsConfig facetsConfig = new FacetsConfig();
    facetsConfig.setHierarchical("Date", true);
    final Set<String> facetFields = new HashSet<String>();
    if (args.hasArg("-facets")) {
        for (String arg : args.getStrings("-facets")) {
            facetFields.add(arg);
        }
    }

    final String dirImpl = args.getString("-dirImpl");
    final String dirPath = args.getString("-indexPath") + "/index";
    final Directory dir;
    OpenDirectory od = OpenDirectory.get(dirImpl);
    dir = od.open(Paths.get(dirPath));

    final String analyzer = args.getString("-analyzer");
    final Analyzer a;
    if (analyzer.equals("EnglishAnalyzer")) {
        a = new EnglishAnalyzer();
    } else if (analyzer.equals("StandardAnalyzer")) {
        a = new StandardAnalyzer();
    } else if (analyzer.equals("StandardAnalyzerNoStopWords")) {
        a = new StandardAnalyzer(CharArraySet.EMPTY_SET);
    } else if (analyzer.equals("ShingleStandardAnalyzer")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(), 2, 2);
    } else if (analyzer.equals("ShingleStandardAnalyzerNoStopWords")) {
        a = new ShingleAnalyzerWrapper(new StandardAnalyzer(CharArraySet.EMPTY_SET), 2, 2);
    } else {
        throw new RuntimeException("unknown analyzer " + analyzer);
    }

    final String lineFile = args.getString("-lineDocsFile");

    // -1 means all docs in the line file:
    final int docCountLimit = args.getInt("-docCountLimit");
    final int numThreads = args.getInt("-threadCount");

    final boolean doForceMerge = args.getFlag("-forceMerge");
    final boolean verbose = args.getFlag("-verbose");

    String indexSortField = null;
    SortField.Type indexSortType = null;
    if (args.hasArg("-indexSort")) {
        indexSortField = args.getString("-indexSort");
        int i = indexSortField.indexOf(':');
        if (i == -1) {
            throw new IllegalArgumentException("-indexSort should have form field:type; got: " + indexSortField);
        }
        String typeString = indexSortField.substring(i + 1, indexSortField.length());
        if (typeString.equals("long")) {
            indexSortType = SortField.Type.LONG;
        } else if (typeString.equals("string")) {
            indexSortType = SortField.Type.STRING;
        } else {
            throw new IllegalArgumentException("-indexSort can only handle 'long' sort; got: " + typeString);
        }
        indexSortField = indexSortField.substring(0, i);
    } else {
        indexSortType = null;
    }

    final double ramBufferSizeMB = args.getDouble("-ramBufferMB");
    final int maxBufferedDocs = args.getInt("-maxBufferedDocs");

    final String defaultPostingsFormat = args.getString("-postingsFormat");
    final boolean doDeletions = args.getFlag("-deletions");
    final boolean printDPS = args.getFlag("-printDPS");
    final boolean waitForMerges = args.getFlag("-waitForMerges");
    final boolean waitForCommit = args.getFlag("-waitForCommit");
    final String mergePolicy = args.getString("-mergePolicy");
    final Mode mode;
    final boolean doUpdate = args.getFlag("-update");
    if (doUpdate) {
        mode = Mode.UPDATE;
    } else {
        mode = Mode.valueOf(args.getString("-mode", "add").toUpperCase(Locale.ROOT));
    }
    int randomDocIDMax;
    if (mode == Mode.UPDATE) {
        randomDocIDMax = args.getInt("-randomDocIDMax");
    } else {
        randomDocIDMax = -1;
    }
    final String idFieldPostingsFormat = args.getString("-idFieldPostingsFormat");
    final boolean addGroupingFields = args.getFlag("-grouping");
    final boolean useCFS = args.getFlag("-cfs");
    final boolean storeBody = args.getFlag("-store");
    final boolean tvsBody = args.getFlag("-tvs");
    final boolean bodyPostingsOffsets = args.getFlag("-bodyPostingsOffsets");
    final int maxConcurrentMerges = args.getInt("-maxConcurrentMerges");
    final boolean addDVFields = args.getFlag("-dvfields");
    final boolean doRandomCommit = args.getFlag("-randomCommit");
    final boolean useCMS = args.getFlag("-useCMS");
    final boolean disableIOThrottle = args.getFlag("-disableIOThrottle");

    if (waitForCommit == false && waitForMerges) {
        throw new RuntimeException("pass -waitForCommit if you pass -waitForMerges");
    }
    if (waitForCommit == false && doForceMerge) {
        throw new RuntimeException("pass -waitForCommit if you pass -forceMerge");
    }
    if (waitForCommit == false && doDeletions) {
        throw new RuntimeException("pass -waitForCommit if you pass -deletions");
    }
    if (useCMS == false && disableIOThrottle) {
        throw new RuntimeException("-disableIOThrottle only makes sense with -useCMS");
    }

    final double nrtEverySec;
    if (args.hasArg("-nrtEverySec")) {
        nrtEverySec = args.getDouble("-nrtEverySec");
    } else {
        nrtEverySec = -1.0;
    }

    // True to start back at the beginning if we run out of
    // docs from the line file source:
    final boolean repeatDocs = args.getFlag("-repeatDocs");

    final String facetDVFormatName;
    if (facetFields.isEmpty()) {
        facetDVFormatName = "Lucene54";
    } else {
        facetDVFormatName = args.getString("-facetDVFormat");
    }

    if (addGroupingFields && docCountLimit == -1) {
        a.close();
        throw new RuntimeException("cannot add grouping fields unless docCount is set");
    }

    args.check();

    System.out.println("Dir: " + dirImpl);
    System.out.println("Index path: " + dirPath);
    System.out.println("Analyzer: " + analyzer);
    System.out.println("Line file: " + lineFile);
    System.out.println("Doc count limit: " + (docCountLimit == -1 ? "all docs" : "" + docCountLimit));
    System.out.println("Threads: " + numThreads);
    System.out.println("Force merge: " + (doForceMerge ? "yes" : "no"));
    System.out.println("Verbose: " + (verbose ? "yes" : "no"));
    System.out.println("RAM Buffer MB: " + ramBufferSizeMB);
    System.out.println("Max buffered docs: " + maxBufferedDocs);
    System.out.println("Default postings format: " + defaultPostingsFormat);
    System.out.println("Do deletions: " + (doDeletions ? "yes" : "no"));
    System.out.println("Wait for merges: " + (waitForMerges ? "yes" : "no"));
    System.out.println("Wait for commit: " + (waitForCommit ? "yes" : "no"));
    System.out.println("IO throttle: " + (disableIOThrottle ? "no" : "yes"));
    System.out.println("Merge policy: " + mergePolicy);
    System.out.println("Mode: " + mode);
    if (mode == Mode.UPDATE) {
        System.out.println("DocIDMax: " + randomDocIDMax);
    }
    System.out.println("ID field postings format: " + idFieldPostingsFormat);
    System.out.println("Add grouping fields: " + (addGroupingFields ? "yes" : "no"));
    System.out.println("Compound file format: " + (useCFS ? "yes" : "no"));
    System.out.println("Store body field: " + (storeBody ? "yes" : "no"));
    System.out.println("Term vectors for body field: " + (tvsBody ? "yes" : "no"));
    System.out.println("Facet DV Format: " + facetDVFormatName);
    System.out.println("Facet fields: " + facetFields);
    System.out.println("Body postings offsets: " + (bodyPostingsOffsets ? "yes" : "no"));
    System.out.println("Max concurrent merges: " + maxConcurrentMerges);
    System.out.println("Add DocValues fields: " + addDVFields);
    System.out.println("Use ConcurrentMergeScheduler: " + useCMS);
    if (nrtEverySec > 0.0) {
        System.out.println("Open & close NRT reader every: " + nrtEverySec + " sec");
    } else {
        System.out.println("Open & close NRT reader every: never");
    }
    System.out.println("Repeat docs: " + repeatDocs);

    if (verbose) {
        InfoStream.setDefault(new PrintStreamInfoStream(System.out));
    }

    final IndexWriterConfig iwc = new IndexWriterConfig(a);

    if (indexSortField != null) {
        iwc.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
    }

    if (mode == Mode.UPDATE) {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    } else {
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    }

    iwc.setMaxBufferedDocs(maxBufferedDocs);
    iwc.setRAMBufferSizeMB(ramBufferSizeMB);

    // So flushed segments do/don't use CFS:
    iwc.setUseCompoundFile(useCFS);

    final AtomicBoolean indexingFailed = new AtomicBoolean();

    iwc.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
    iwc.setMergePolicy(getMergePolicy(mergePolicy, useCFS));

    // Keep all commit points:
    if (doDeletions || doForceMerge) {
        iwc.setIndexDeletionPolicy(NoDeletionPolicy.INSTANCE);
    }

    final Codec codec = new Lucene62Codec() {
        @Override
        public PostingsFormat getPostingsFormatForField(String field) {
            return PostingsFormat.forName(field.equals("id") ? idFieldPostingsFormat : defaultPostingsFormat);
        }

        private final DocValuesFormat facetsDVFormat = DocValuesFormat.forName(facetDVFormatName);
        //private final DocValuesFormat lucene42DVFormat = DocValuesFormat.forName("Lucene42");
        //private final DocValuesFormat diskDVFormat = DocValuesFormat.forName("Disk");
        //private final DocValuesFormat lucene45DVFormat = DocValuesFormat.forName("Lucene45");
        private final DocValuesFormat directDVFormat = DocValuesFormat.forName("Direct");

        @Override
        public DocValuesFormat getDocValuesFormatForField(String field) {
            if (facetFields.contains(field) || field.equals("$facets")) {
                return facetsDVFormat;
                //} else if (field.equals("$facets_sorted_doc_values")) {
                //return diskDVFormat;
            } else {
                // Use default DVFormat for all else:
                //System.out.println("DV: field=" + field + " format=" + super.getDocValuesFormatForField(field));
                return super.getDocValuesFormatForField(field);
            }
        }
    };

    iwc.setCodec(codec);

    System.out.println("IW config=" + iwc);

    IndexWriter w = new IndexWriter(dir, iwc);

    System.out.println("Index has " + w.maxDoc() + " docs");

    final TaxonomyWriter taxoWriter;
    if (facetFields.isEmpty() == false) {
        taxoWriter = new DirectoryTaxonomyWriter(od.open(Paths.get(args.getString("-indexPath"), "facets")),
                IndexWriterConfig.OpenMode.CREATE);
    } else {
        taxoWriter = null;
    }

    // Fixed seed so group field values are always consistent:
    final Random random = new Random(17);

    LineFileDocs lineFileDocs = new LineFileDocs(lineFile, repeatDocs, storeBody, tvsBody, bodyPostingsOffsets,
            false, taxoWriter, facetFields, facetsConfig, addDVFields);

    float docsPerSecPerThread = -1f;
    //float docsPerSecPerThread = 100f;

    IndexThreads threads = new IndexThreads(random, w, indexingFailed, lineFileDocs, numThreads, docCountLimit,
            addGroupingFields, printDPS, mode, docsPerSecPerThread, null, nrtEverySec, randomDocIDMax);

    System.out.println("\nIndexer: start");
    final long t0 = System.currentTimeMillis();

    threads.start();

    while (!threads.done() && indexingFailed.get() == false) {
        Thread.sleep(100);
        // Commits once per minute on average:
        if (doRandomCommit && random.nextInt(600) == 17) {
            System.out.println("Indexer: now commit");
            long commitStartNS = System.nanoTime();
            w.commit();
            System.out.println(String.format(Locale.ROOT, "Indexer: commit took %.1f msec",
                    (System.nanoTime() - commitStartNS) / 1000000.));
        }
    }

    threads.stop();

    final long t1 = System.currentTimeMillis();
    System.out.println("\nIndexer: indexing done (" + (t1 - t0) + " msec); total " + w.maxDoc() + " docs");
    // if we update we can not tell how many docs
    if (threads.failed.get()) {
        throw new RuntimeException("exceptions during indexing");
    }
    if (mode != Mode.UPDATE && docCountLimit != -1 && w.maxDoc() != docCountLimit) {
        throw new RuntimeException("w.maxDoc()=" + w.maxDoc() + " but expected " + docCountLimit);
    }

    final Map<String, String> commitData = new HashMap<String, String>();

    if (waitForMerges) {
        w.close();
        IndexWriterConfig iwc2 = new IndexWriterConfig(a);
        iwc2.setMergeScheduler(getMergeScheduler(indexingFailed, useCMS, maxConcurrentMerges, disableIOThrottle));
        iwc2.setMergePolicy(getMergePolicy(mergePolicy, useCFS));
        iwc2.setCodec(codec);
        iwc2.setUseCompoundFile(useCFS);
        iwc2.setMaxBufferedDocs(maxBufferedDocs);
        iwc2.setRAMBufferSizeMB(ramBufferSizeMB);
        if (indexSortField != null) {
            iwc2.setIndexSort(new Sort(new SortField(indexSortField, indexSortType)));
        }
        w = new IndexWriter(dir, iwc2);
        long t2 = System.currentTimeMillis();
        System.out.println("\nIndexer: waitForMerges done (" + (t2 - t1) + " msec)");
    }

    if (waitForCommit) {
        commitData.put("userData", "multi");
        w.setLiveCommitData(commitData.entrySet());
        long t2 = System.currentTimeMillis();
        w.commit();
        long t3 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit multi (took " + (t3 - t2) + " msec)");
    } else {
        w.rollback();
        w = null;
    }

    if (doForceMerge) {
        long forceMergeStartMSec = System.currentTimeMillis();
        w.forceMerge(1);
        long forceMergeEndMSec = System.currentTimeMillis();
        System.out.println("\nIndexer: force merge done (took " + (forceMergeEndMSec - forceMergeStartMSec) + " msec)");

        commitData.put("userData", "single");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t5 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit single done (took " + (t5 - forceMergeEndMSec) + " msec)");
    }

    if (doDeletions) {
        final long t5 = System.currentTimeMillis();
        // Randomly delete 5% of the docs
        final Set<Integer> deleted = new HashSet<Integer>();
        final int maxDoc = w.maxDoc();
        final int toDeleteCount = (int) (maxDoc * 0.05);
        System.out.println("\nIndexer: delete " + toDeleteCount + " docs");
        while (deleted.size() < toDeleteCount) {
            final int id = random.nextInt(maxDoc);
            if (!deleted.contains(id)) {
                deleted.add(id);
                w.deleteDocuments(new Term("id", LineFileDocs.intToID(id)));
            }
        }
        final long t6 = System.currentTimeMillis();
        System.out.println("\nIndexer: deletes done (took " + (t6 - t5) + " msec)");
        commitData.put("userData", doForceMerge ? "delsingle" : "delmulti");
        w.setLiveCommitData(commitData.entrySet());
        w.commit();
        final long t7 = System.currentTimeMillis();
        System.out.println("\nIndexer: commit delmulti done (took " + (t7 - t6) + " msec)");
        if (doUpdate || w.numDocs() != maxDoc - toDeleteCount) {
            throw new RuntimeException("count mismatch: w.numDocs()=" + w.numDocs() + " but expected "
                    + (maxDoc - toDeleteCount));
        }
    }

    if (taxoWriter != null) {
        System.out.println("Taxonomy has " + taxoWriter.getSize() + " ords");
        taxoWriter.commit();
        taxoWriter.close();
    }

    final long tCloseStart = System.currentTimeMillis();
    if (w != null) {
        w.close();
        w = null;
    }
    if (waitForCommit) {
        System.out.println("\nIndexer: at close: " + SegmentInfos.readLatestCommit(dir));
        System.out.println("\nIndexer: close took " + (System.currentTimeMillis() - tCloseStart) + " msec");
    }
    dir.close();
    final long tFinal = System.currentTimeMillis();

    System.out.println("\nIndexer: net bytes indexed " + threads.getBytesIndexed());

    final long indexingTime;
    if (waitForCommit) {
        indexingTime = tFinal - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec)");
    } else {
        indexingTime = t1 - t0;
        System.out.println("\nIndexer: finished (" + indexingTime + " msec), excluding commit");
    }
    System.out.println("\nIndexer: " + (threads.getBytesIndexed() / 1024. / 1024. / 1024. / (indexingTime / 3600000.))
            + " GB/hour plain text");
}
From source file:perf.PrintPerFieldHeapUsage.java
License:Apache License
public static void main(String[] args) throws IOException {
    Directory dir = FSDirectory.open(Paths.get("fields"));

    int fieldUpto;
    IndexWriterConfig iwc;
    IndexWriter w;
    long t0;
    IndexReader r;

    // Stored field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StoredField("f" + fieldUpto, "text" + i));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StoredField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Indexed StringField:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new StringField("f" + fieldUpto, "text" + i, Field.Store.NO));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique StringField: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Numeric DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new NumericDocValuesField("f" + fieldUpto, i));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique NumericDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));

    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getNumericValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique NumericDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    // Sorted DV field:
    iwc = new IndexWriterConfig(new WhitespaceAnalyzer());
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    w = new IndexWriter(dir, iwc);
    fieldUpto = 0;
    t0 = System.nanoTime();
    for (int i = 0; i < FIELD_COUNT; i++) {
        Document doc = new Document();
        doc.add(new SortedDocValuesField("f" + fieldUpto, new BytesRef("text" + i)));
        fieldUpto++;
        w.addDocument(doc);
    }
    w.forceMerge(1);
    w.close();
    r = DirectoryReader.open(dir);
    System.out.println(String.format(Locale.ROOT, "Took %.1f sec; bytes per unique SortedDocValuesField, latent: %.1f",
            (System.nanoTime() - t0) / 1000000000.0, (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));

    // Now force lazy loading of all the DV fields:
    for (int i = 0; i < FIELD_COUNT; i++) {
        MultiDocValues.getSortedValues(r, "f" + i);
    }
    System.out.println(String.format(Locale.ROOT, "Bytes per unique SortedDocValuesField, loaded: %.1f",
            (RamUsageTester.sizeOf(r) / (double) FIELD_COUNT)));
    r.close();

    dir.close();
}
From source file:semanticRelatedness.MakeLuceneIndex.java
License:Apache License
/**
 * Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    System.currentTimeMillis();
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        //Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new DefaultSimilarity());
        //iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }
            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }
            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;
                    //wikidumpExtractor.setTitleSeparator( "_" );
                    //doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //System.out.println(title + " " + title + " " + title + " " + title + " " + line.substring(endOfTitle + 1));
                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
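To confirm that a forceMerge(1) like the one above actually took effect, one can reopen the index and count the reader's leaves (one leaf per segment). A small sketch, with a hypothetical index path:

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class VerifyMerge {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(
                FSDirectory.open(Paths.get("/tmp/example-index")))) { // hypothetical path
            // After forceMerge(1) + commit, the reader should see one leaf.
            System.out.println("segments: " + reader.leaves().size());
        }
    }
}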
From source file:tool.export.ExportToLucene.java
License:Open Source License
public static void main(String[] args) throws IOException {
    String usage = "tool.export.ExportToLucene" + " [-index INDEX_PATH] [-frames FRAMES_PATH]\n\n"
            + "This indexes the frames in FRAMES_PATH, creating a Lucene index " + "in INDEX_PATH";
    String indexPath = "index";
    String framesPath = null;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-frames".equals(args[i])) {
            framesPath = args[i + 1];
            i++;
        }
    }

    if (framesPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    Directory dir = FSDirectory.open(new File(indexPath));
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_42);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_42, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(3072.0);
    IndexWriter writer = new IndexWriter(dir, iwc);

    List<PairOfWritables<Text, DoubleWritable>> bigrams = SequenceFileUtils.readDirectory(new Path(framesPath));
    for (PairOfWritables<Text, DoubleWritable> bigram : bigrams) {
        String[] slots = bigram.getLeftElement().toString().trim().split("\t");
        double probability = bigram.getRightElement().get();

        Document doc = new Document();
        for (int i = 0; i < slots.length; i++) {
            String slot = slots[i];
            if (i == 0) {
                slot = slot.substring(2, slot.length() - 1);
            } else {
                slot = slot.substring(1, slot.length() - 1);
            }
            String[] slotParts = slot.split(",");
            if (slotParts.length >= 3) {
                String slotRelation = slotParts[0];
                StringBuilder slotValue = new StringBuilder();
                String separator = "";
                for (int j = 1; j < slotParts.length - 1; j++) {
                    slotValue.append(separator);
                    slotValue.append(slotParts[j]);
                    separator = ",";
                }
                Field slotField = new TextField(slotRelation,
                        slotValue.toString().substring(1, slotValue.toString().length() - 1), Field.Store.YES);
                doc.add(slotField);
            }
        }
        Field probabilityField = new DoubleField("probability", probability, Field.Store.YES);
        doc.add(probabilityField);
        writer.addDocument(doc);
    }

    writer.forceMerge(1);
    writer.close();
}